SPACE-TIME PREDICTION OF BIKE SHARE DEMAND

2025 Q1 VS 2024 Q4 HISTORICAL MODEL

Author

Tess Vu

Published

November 17, 2025

PART I: 2024 Q4 VS. 2025 Q1

1. Data Download

Indego Bikeshare Data Using 2024 Q4 and 2025 Q1.

Iowa Environmental Mesonet (IEM) ASOS PHL Weather Station Downloaded for the years aligning with the Indego data. There was an issue with the riem library, which would not download March 2024 specifically for an unknown reason, so the data were downloaded manually.

Code
# --- Packages ---------------------------------------------------------------
# Data wrangling, dates, cleaning.
library(tidyverse)
library(lubridate)
library(janitor)
library(zoo)
# Spatial data and census access.
library(sf)
library(tigris)
library(tidycensus)
# Plotting, tables, and fonts.
library(viridis)
library(knitr)
library(kableExtra)
library(patchwork)
library(scales)
library(showtext)
library(sysfonts)
# Modeling.
library(glmnet)
library(fixest)

# Use planar (GEOS) geometry rather than s2 to avoid spherical-geometry
# edge cases in the point-in-polygon joins below.
sf_use_s2(FALSE)

# Load the Google fonts used by the custom themes; render text at 300 dpi.
font_add_google("Outfit", "outfit")
font_add_google("Anonymous Pro", "anonymous")
showtext_opts(dpi = 300)
showtext_auto()

# Get rid of scientific notation in printed output.
options(scipen = 999)

# Save chunk figures as PNG files under figures/.
knitr::opts_chunk$set(
  dev = "png",
  fig.path = "figures/"
)
Code
# Shared chart theme: minimal base with a cream canvas, Outfit display type
# for titling, and Anonymous Pro monospace for axes/legends. All text sizes
# scale off `base_size`.
theme_plot <- function(base_size = 11) {
  # Palette used throughout the report.
  cream <- "#f5f4f0"
  ink <- "#2d2a26"
  slate <- "#51534a"
  stone <- "#9b9e98"
  grid_grey <- "#d4d2cd"
  cream_rect <- element_rect(fill = cream, color = cream)

  theme_minimal(base_size = base_size) +
    theme(
      # Centered titling block.
      plot.title = element_text(
        face = "bold", family = "outfit", color = ink,
        size = base_size + 1, hjust = 0.5
      ),
      plot.subtitle = element_text(
        face = "italic", family = "outfit", color = slate,
        size = base_size - 1, hjust = 0.5,
        margin = margin(b = 0.5, unit = "cm")
      ),
      plot.caption = element_text(
        face = "italic", family = "anonymous", color = stone,
        size = base_size - 2
      ),
      # Axes: italic monospace labels, bold titles, extra breathing room.
      axis.text = element_text(
        face = "italic", family = "anonymous",
        size = base_size - 2, hjust = 0.5
      ),
      axis.title = element_text(
        face = "bold", family = "anonymous",
        size = base_size - 1, hjust = 0.5
      ),
      axis.title.y = element_text(margin = margin(r = 0.5, unit = "cm")),
      axis.title.x = element_text(margin = margin(t = 0.5, unit = "cm")),
      # Legend sits under the plot with its title above the keys.
      legend.position = "bottom",
      legend.title.position = "top",
      legend.title = element_text(
        face = "italic", family = "anonymous",
        size = base_size - 1, hjust = 0.5
      ),
      legend.text = element_text(
        face = "italic", family = "anonymous",
        size = base_size - 2, hjust = 0.5
      ),
      legend.key.width = unit(2, "cm"),
      legend.key.height = unit(0.5, "cm"),
      # Subtle grid plus a uniform cream canvas on every surface.
      panel.grid.major = element_line(colour = grid_grey),
      panel.grid.minor = element_line(colour = grid_grey),
      legend.background = cream_rect,
      plot.background = cream_rect,
      panel.background = cream_rect,
      plot.margin = unit(c(1, 1, 1, 1), "cm")
    )
}

# Map theme: same palette and typography as theme_plot(), but with the panel
# grid, axis text, and axis titles removed for clean choropleths. Caption and
# legend text run one point smaller than in theme_plot().
theme_map <- function(base_size = 11) {
  theme_minimal(base_size = base_size) +
    theme(
      plot.title = element_text(face = "bold",
                                family = "outfit",
                                color = "#2d2a26",
                                size = base_size + 1,
                                hjust = 0.5
                                ),
      plot.subtitle = element_text(face = "italic",
                                   family = "outfit",
                                   color = "#51534a",
                                   size = base_size - 1,
                                   hjust = 0.5,
                                   margin = margin(b = 0.5, unit = "cm")
                                   ),
      plot.caption = element_text(face = "italic",
                                  family = "anonymous",
                                  color = "#9b9e98",
                                  size = base_size - 3
                                  ),
      legend.position = "bottom",
      # Maps need no panel grid or axis annotation.
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      axis.text = element_blank(),
      axis.title = element_blank(),
      legend.title = element_text(face = "italic",
                                  family = "anonymous",
                                  size = base_size - 1,
                                  hjust = 0.5
                                  ),
      legend.title.position = "top",
      legend.text = element_text(face = "italic",
                                 family = "anonymous",
                                 size = base_size - 3,
                                 hjust = 0.5
                                 ),
      legend.key.width = unit(2, "cm"),
      legend.key.height = unit(0.5, "cm"),
      # Uniform cream canvas so the map blends with the page background.
      legend.background = element_rect(fill = "#f5f4f0", color = "#f5f4f0"),
      plot.background = element_rect(fill = "#f5f4f0", color = "#f5f4f0"),
      panel.background = element_rect(fill = "#f5f4f0", color = "#f5f4f0"),
      plot.margin = unit(c(1, 1, 1, 1), "cm")
    )
  }
Code
# Register the Census API key with tidycensus. The original code only assigned
# the key string to a local variable named `census_api_key`, which shadowed
# tidycensus::census_api_key() and never registered anything. The variable is
# kept for backward compatibility; the namespaced call performs the actual
# registration (skipped when the environment variable is unset).
census_api_key <- Sys.getenv("CENSUS_API_KEY")
if (nzchar(census_api_key)) {
  tidycensus::census_api_key(census_api_key)
}

i. Load and Clean Bike Share Data

Code
# --- Load and clean Indego trip records -------------------------------------
bike_data <- read.csv("data/indego_2024_2025.csv")

# Candidate timestamp formats observed across Indego exports; parse_date_time
# tries each in turn.
date_formats <- c(
  "%m/%d/%Y %H:%M",        # 1/1/2020 10:30
  "%Y-%m-%d %H:%M:%S",     # 2020-01-01 10:30:00
  "%m/%d/%y %H:%M",        # 1/1/20 10:30
  "%m/%d/%Y %I:%M:%S %p",  # 1/1/2020 10:30:00 AM/PM
  "%m/%d/%Y %I:%M %p"      # 1/1/2020 10:30 AM/PM
)

# Parse both timestamps, drop unusable rows, then derive the time fields used
# throughout the analysis.
bike_data <- bike_data %>%
  mutate(
    start_parsed = parse_date_time(start_time, orders = date_formats),
    end_parsed = parse_date_time(end_time, orders = date_formats)
  ) %>%
  filter(
    !is.na(start_parsed),
    !is.na(end_parsed),
    !is.na(start_station_id),
    duration > 0,
    # Keep trips that start inside a greater-Philadelphia bounding box.
    between(start_lon, -75.30, -74.95),
    between(start_lat, 39.85, 40.20)
  ) %>%
  mutate(
    start_time = start_parsed,
    end_time = end_parsed,
    date = as_date(start_time),
    year = year(start_time),
    interval60 = floor_date(start_time, unit = "hour"),
    quarter_year = paste0("Q", quarter(start_time), " ", year(start_time))
  ) %>%
  select(-start_parsed, -end_parsed)

ii. Subset Data by Time Period

Code
# 2025 Q1 subset: trips whose start falls in January-March 2025.
bike_q1 <- bike_data %>%
  filter(year == 2025, quarter(start_time) == 1)

# Trip count, formatted with thousands separators.
cat("2025 Q1 Trips:", format(nrow(bike_q1), big.mark = ","), "\n")
2025 Q1 Trips: 201,134 
Code
# First and last trip dates in the Q1 subset.
cat("Date Range:", format(min(bike_q1$date), "%Y-%m-%d"), 
    "to", format(max(bike_q1$date), "%Y-%m-%d"), "\n\n")
Date Range: 2025-01-01 to 2025-03-31 
Code
# Number of distinct start stations active in Q1.
# NOTE(review): this counts `start_station` (name) while joins elsewhere key
# on `start_station_id` -- confirm the two columns agree one-to-one.
cat("Start Stations:", length(unique(bike_q1$start_station)), "\n")
Start Stations: 264 
Code
# Frequency of trip route categories (one-way vs. round trip).
table(bike_q1$trip_route_category)

   One Way Round Trip 
    190357      10777 
Code
# Frequency of passholder types (day pass, monthly, annual, flex, walk-up).
table(bike_q1$passholder_type)

  Day Pass   Indego30  Indego365 IndegoFlex    Walk-up 
      5500      93800      91423          3      10408 
Code
# Frequency of bike types (electric vs. standard).
table(bike_q1$bike_type)

electric standard 
  129277    71857 
Code
# 2024 Q4 subset: trips whose start falls in October-December 2024.
# (The original code first copied the entire data set into bike_q4 and then
# immediately overwrote it with the filtered version; the redundant copy is
# removed. Its "(Q2-Q3 2024)" comment was also wrong for this date range.)
bike_q4 <- bike_data %>%
  filter(year == 2024, quarter(start_time) == 4)

# Trip count, formatted with thousands separators.
cat("2024 Q4 Trips:", format(nrow(bike_q4), big.mark = ","), "\n")
2024 Q4 Trips: 167,225 
Code
# First and last trip dates in the Q4 subset.
cat("Date Range:", format(min(bike_q4$date), "%Y-%m-%d"),
    "to", format(max(bike_q4$date), "%Y-%m-%d"), "\n")
Date Range: 2024-10-01 to 2024-12-31 
Code
# Number of distinct start stations active in Q4.
# NOTE(review): counts `start_station` (name) while joins elsewhere key on
# `start_station_id` -- confirm the two columns agree one-to-one.
cat("Start Stations:", length(unique(bike_q4$start_station)), "\n")
Start Stations: 255 
Code
# Frequency of trip route categories (one-way vs. round trip).
table(bike_q4$trip_route_category)

   One Way Round Trip 
    157965       9260 
Code
# Frequency of passholder types.
# NOTE(review): the rendered output shows a literal "NULL" level (454 rows) --
# these look like missing passholder strings that may warrant recoding to NA
# upstream; confirm against the raw export.
table(bike_q4$passholder_type)

  Day Pass   Indego30  Indego365 IndegoFlex       NULL    Walk-up 
      5962      88922      63552          1        454       8334 
Code
# Frequency of bike types (electric vs. standard).
table(bike_q4$bike_type)

electric standard 
   98028    69197 

iii. Create Hourly Panel Data

Code
# Aggregate raw trips to one row per station-hour, carrying calendar and
# season descriptors for downstream modeling. Returns an ungrouped data frame
# with a `trips` count per (station, hour) combination.
make_hourly_panel <- function(data_frame) {
  # Month-number -> meteorological season lookup (index 1..12).
  season_lookup <- c(
    "Winter", "Winter", "Spring", "Spring", "Spring", "Summer",
    "Summer", "Summer", "Fall", "Fall", "Fall", "Winter"
  )

  data_frame %>%
    mutate(
      date = as_date(start_time),
      dow = wday(date, label = TRUE, week_start = 1),
      is_weekend = dow %in% c("Sat", "Sun"),
      month = month(date),
      year = year(date),
      hour = hour(start_time),
      season = season_lookup[month]
    ) %>%
    group_by(
      start_station_id, start_lat, start_lon,
      interval60, date, year, month, dow, hour, is_weekend, season
    ) %>%
    summarize(trips = n(), .groups = "drop")
}
Code
# Build the station-hour panels for each quarter.
panel_q1_base <- make_hourly_panel(bike_q1)
panel_q4_base <- make_hourly_panel(bike_q4)

cat("2025 Q1 Base Panel:", format(nrow(panel_q1_base), big.mark = ","), "rows\n")
2025 Q1 Base Panel: 122,323 rows
Code
# Row count of the Q4 station-hour panel.
cat("2024 Q4 Base Panel:", format(nrow(panel_q4_base), big.mark = ","), "rows\n")
2024 Q4 Base Panel: 90,581 rows

2. Trip Visualizations

i. Daily Patterns

Code
# The original chunk built each quarter's daily series and plot with two
# copies of identical code; the shared logic is factored into helpers so the
# styling stays consistent between panels.

# Helper: collapse a station-hour panel to total trips per calendar day.
sum_daily_trips <- function(panel) {
  panel %>%
    group_by(date) %>%
    summarize(trips = sum(trips), .groups = "drop")
}

# Helper: standard daily-trips line chart (blue line, dashed orange smoother).
plot_daily_trips <- function(daily, subtitle_label) {
  ggplot(daily, aes(date, trips)) +
    geom_line(color = "#778ac5", linewidth = 1) +
    geom_smooth(se = FALSE, color = "#ff4100", linetype = "dashed") +
    scale_y_continuous(labels = comma) +
    labs(
      title = "Indego Daily Trips",
      subtitle = subtitle_label,
      x = "Date", y = "Trips"
    ) +
    theme_plot()
}

# 2025 Q1 daily trips.
daily_trips_q1 <- sum_daily_trips(panel_q1_base)
daily_trips_q1_plot <- plot_daily_trips(daily_trips_q1, "2025 Q1")

# 2024 Q4 daily trips.
daily_trips_q4 <- sum_daily_trips(panel_q4_base)
daily_trips_q4_plot <- plot_daily_trips(daily_trips_q4, "2024 Q4")

# Stack the two quarters vertically (patchwork).
daily_trips_q1_plot / daily_trips_q4_plot

3. Census Data Integration

i. Load and Clean Census Data

Code
# Pull 2023 ACS tract-level estimates for Philadelphia County (with tract
# geometry), rename the wide-format estimate columns, and derive the share
# variables used as station covariates.
# NOTE(review): wrapping the entire pipeline in suppressMessages()/
# suppressWarnings() also hides genuine download failures -- consider
# narrowing the suppression to the get_acs() call itself.
philly_census <- suppressMessages(suppressWarnings(get_acs(
  geography = "tract",
  variables = c(
    "B01003_001", # Total population.
    "B19013_001", # Median household income.
    "B08301_001", # Total commuters.
    "B08301_010", # Commute by public transportation.
    "B02001_002", # White alone.
    "B25077_001", # Median home value.
    "B15003_022", # Bachelor's degree or higher.
    "B08201_002", # No vehicle.
    "B01001_002", # Male population.
    "B01001_026", # Female population.
    "B15003_001", # Adult population 25+.
    "B25003_001" # Total households for car ownership.
  ),
  state = "PA",
  county = "Philadelphia",
  year = 2023,
  geometry = TRUE,
  output = "wide"
  ) %>%
  rename(
    total_pop = B01003_001E,
    med_inc = B19013_001E,
    total_commute = B08301_001E,
    public_commute = B08301_010E,
    white_pop = B02001_002E,
    med_h_val = B25077_001E,
    bach_plus = B15003_022E,
    no_car = B08201_002E,
    male_pop = B01001_002E,
    female_pop = B01001_026E,
    adult_pop = B15003_001E,
    total_hh = B25003_001E
  ) %>%
  # Shares guard against division by zero with pmax(denominator, 1).
  # NOTE(review): these shares are computed before sentinel codes are replaced
  # with NA in a later chunk -- confirm no sentinel values occur in the count
  # columns used here.
  mutate(
    pct_public_commute = public_commute / pmax(total_commute, 1),
    pct_white = white_pop / pmax(total_pop, 1),
    pct_bach_plus = bach_plus / pmax(adult_pop, 1),
    pct_no_car = no_car / pmax(total_hh, 1),
    pct_male = male_pop / pmax(total_pop, 1)
  ) %>%
  # Reproject to WGS84 to match the station point coordinates.
  st_transform(4326)
  )
  )

  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |=                                                                     |   1%
  |                                                                            
  |==                                                                    |   2%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |===                                                                   |   4%
  |                                                                            
  |===                                                                   |   5%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |=====                                                                 |   6%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |======                                                                |   8%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |=========                                                             |  12%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |==========                                                            |  15%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |=============                                                         |  19%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |===============                                                       |  21%
  |                                                                            
  |===============                                                       |  22%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |=================                                                     |  25%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |===================                                                   |  28%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |=====================                                                 |  31%
  |                                                                            
  |======================                                                |  32%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |========================                                              |  34%
  |                                                                            
  |========================                                              |  35%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |==========================                                            |  38%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |============================                                          |  39%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |=============================                                         |  41%
  |                                                                            
  |==============================                                        |  42%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |==================================                                    |  48%
  |                                                                            
  |==================================                                    |  49%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |====================================                                  |  51%
  |                                                                            
  |=====================================                                 |  52%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |=======================================                               |  55%
  |                                                                            
  |=======================================                               |  56%
  |                                                                            
  |========================================                              |  57%
  |                                                                            
  |=========================================                             |  58%
  |                                                                            
  |=========================================                             |  59%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |===========================================                           |  61%
  |                                                                            
  |===========================================                           |  62%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |=============================================                         |  65%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |================================================                      |  68%
  |                                                                            
  |================================================                      |  69%
  |                                                                            
  |=================================================                     |  70%
  |                                                                            
  |==================================================                    |  71%
  |                                                                            
  |==================================================                    |  72%
  |                                                                            
  |===================================================                   |  73%
  |                                                                            
  |====================================================                  |  74%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |=====================================================                 |  76%
  |                                                                            
  |======================================================                |  77%
  |                                                                            
  |======================================================                |  78%
  |                                                                            
  |=======================================================               |  79%
  |                                                                            
  |========================================================              |  80%
  |                                                                            
  |========================================================              |  81%
  |                                                                            
  |=========================================================             |  82%
  |                                                                            
  |==========================================================            |  83%
  |                                                                            
  |==========================================================            |  84%
  |                                                                            
  |===========================================================           |  85%
  |                                                                            
  |============================================================          |  86%
  |                                                                            
  |=============================================================         |  86%
  |                                                                            
  |=============================================================         |  87%
  |                                                                            
  |==============================================================        |  88%
  |                                                                            
  |===============================================================       |  89%
  |                                                                            
  |===============================================================       |  90%
  |                                                                            
  |================================================================      |  91%
  |                                                                            
  |=================================================================     |  92%
  |                                                                            
  |=================================================================     |  93%
  |                                                                            
  |==================================================================    |  94%
  |                                                                            
  |===================================================================   |  95%
  |                                                                            
  |===================================================================   |  96%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |===================================================================== |  98%
  |                                                                            
  |===================================================================== |  99%
  |                                                                            
  |======================================================================| 100%
Code
# Replace census sentinel codes (used for suppressed/unavailable estimates)
# with NA so they cannot distort maps, summaries, or models.
bad_placeholders <- c(-666666666, -999999999, -6666666, -999999)

philly_census <- philly_census %>%
  mutate(across(
    c(total_pop, med_inc, total_commute, public_commute, white_pop, med_h_val,
      bach_plus, no_car, male_pop, female_pop, adult_pop, total_hh), 
    ~ replace(.x, .x %in% bad_placeholders, NA_real_)
    )) %>%
  # Recompute the derived shares: they were originally built *before* the
  # sentinel codes were removed, so any sentinel in a component column would
  # have leaked into the percentages. When no sentinels are present the
  # recomputed values are identical to the originals.
  mutate(
    pct_public_commute = public_commute / pmax(total_commute, 1),
    pct_white = white_pop / pmax(total_pop, 1),
    pct_bach_plus = bach_plus / pmax(adult_pop, 1),
    pct_no_car = no_car / pmax(total_hh, 1),
    pct_male = male_pop / pmax(total_pop, 1)
  )

cat("Philadelphia Census Tracts:", nrow(philly_census), "\n")
Philadelphia Census Tracts: 408 

ii. Create Station-Census Lookup Table

Code
# Unique station locations: one coordinate pair per station id, taking the
# first observed lat/lon for each.
stations_geo <- bike_q4 %>%
  group_by(start_station_id) %>%
  summarize(
    start_lat = first(start_lat),
    start_lon = first(start_lon),
    .groups = "drop"
    ) %>%
  filter(!is.na(start_lat), !is.na(start_lon))

# Convert to sf points in WGS84 to match the census tract geometry.
stations_sf <- stations_geo %>%
  st_as_sf(coords = c("start_lon", "start_lat"), crs = 4326)

# Spatially join each station point to the tract(s) it intersects.
station_census_raw <- suppressMessages(
  st_join(stations_sf, philly_census, join = st_intersects, left = TRUE)
  )

# Deduplicate stations that intersect multiple tracts (boundary points).
# NOTE(review): arrange() sorts NA med_inc last, so slice(1) prefers a match
# with income data; among several data-bearing matches it keeps the
# lowest-income tract -- confirm that tie-break is intended.
station_census_lookup <- station_census_raw %>%
  st_drop_geometry() %>%
  arrange(start_station_id, med_inc) %>%
  group_by(start_station_id) %>%
  slice(1) %>% # Keep only first match per station.
  ungroup()%>%
  select(start_station_id, med_inc, pct_public_commute, pct_white, total_pop,
         pct_bach_plus, pct_no_car, pct_male) %>%
  filter(!is.na(med_inc)) # Keep only residential stations, though not ideal real-life representation.

# Stations with a usable census match.
valid_stations <- station_census_lookup$start_station_id

# Re-attach raw coordinates for mapping.
station_census_lookup <- station_census_lookup %>%
  left_join(stations_geo, by = "start_station_id")

# Verify the lookup has exactly one row per station.
n_duplicates <- station_census_lookup %>%
  count(start_station_id) %>%
  filter(n > 1) %>%
  nrow()

cat("Valid Residential Stations:", length(valid_stations), "\n")
Valid Residential Stations: 235 
Code
# Sanity check: should be 0 after the slice(1) deduplication above.
cat("Duplicate Stations:", n_duplicates, "\n")
Duplicate Stations: 0 
Code
# Distinct Q4 station coordinate records. A station can contribute more than
# one row here if its reported lat/lon varies between trips.
stations_q4 <- bike_q4 %>%
  distinct(start_station_id, start_lat, start_lon) %>%
  filter(!is.na(start_lat), !is.na(start_lon))

# Station points over tract median income.
# NOTE(review): a nearly identical map is rebuilt in the next chunk; consider
# consolidating the two.
med_inc_station_map <- ggplot() +
  geom_sf(data = philly_census, aes(fill = med_inc), color = NA) +
  geom_point(
    data = stations_q4,
    aes(x = start_lon, y = start_lat),
    color = "#ff4100", size = 1, alpha = 0.8
  ) +
  scale_fill_viridis(
    option = "mako",
    direction = -1,
    name = "Median Income",
    labels = dollar,
    na.value = "#9b9e98"
  ) +
  labs(
    title = "Indego Stations",
    subtitle = "Philadelphia, PA",
    caption = "Color: Median household income by census tracts."
  ) +
  theme_map()

med_inc_station_map

iii. Map Stations and Census Context

Code
# Stations and median income map: tract choropleth with all station
# locations (stations_geo) overlaid. Overwrites the earlier version of
# med_inc_station_map built from stations_q4.
med_inc_station_map <- ggplot() +
  geom_sf(data = philly_census, aes(fill = med_inc), color = NA) +
  scale_fill_viridis(
    option = "mako",
    direction = -1,
    name = "Median Income",
    labels = dollar,
    na.value = "#9b9e98"
    ) +
  geom_point(
    data = stations_geo,
    aes(x = start_lon, y = start_lat),
    color = "#ff4100", size = 1, alpha = 0.8
    ) +
  labs(
    title = "Indego Stations",
    subtitle = "Philadelphia, PA"
    ) +
  theme_map()

# Flag which stations matched a census tract: a station has census data
# exactly when med_inc joins in non-missing.
census_income <- station_census_lookup %>%
  select(start_station_id, med_inc)

stations_for_map <- stations_geo %>%
  left_join(census_income, by = "start_station_id") %>%
  mutate(has_census = !is.na(med_inc))

# Map highlighting stations WITHOUT tract-level demographics (orange X)
# against matched stations (grey dots) on the income choropleth.
missing_station_map <- ggplot() +
  geom_sf(data = philly_census, aes(fill = med_inc), color = NA) +
  scale_fill_viridis(
    option = "mako",
    direction = -1,
    name = "Median Income",
    labels = dollar,
    na.value = "#9b9e98"
    ) +
  # Matched stations.
  geom_point(
    data = stations_for_map %>% filter(has_census),
    aes(start_lon, start_lat),
    color = "#51534a", size = 1, alpha = 0.6
    ) +
  # Unmatched stations, drawn as X marks on top.
  geom_point(
    data = stations_for_map %>% filter(!has_census),
    aes(start_lon, start_lat),
    color = "#ff4100", size = 1, shape = 4, stroke = 1
    ) +
  labs(
    title = "Indego Stations and Census Matches",
    subtitle = "Philadelphia, PA",
    caption = "Orange X: Stations without tract-level demographics.\nColor: Median household income by census tract."
    ) +
  theme_map()

# Side-by-side layout with one shared, horizontal legend.
(med_inc_station_map | missing_station_map) + 
  plot_layout(guides = "collect") & 
  theme(legend.position = "bottom",
        legend.direction = "horizontal")

4. Weather Data Integration

i. Load and Clean Weather Data

Code
# Load hourly ASOS observations for PHL from a local IEM export (riem
# downloads failed, likely due to the large volume requested); keep only
# the fields used downstream and rename them to short analysis names.
weather_data <- read.csv("data/PHL.csv") %>%
  select(valid, tmpf, dwpf, relh, sknt, p01i, vsby, gust, wxcodes, feel) %>%
  rename(
    interval60 = valid,
    temp = tmpf,
    dew = dwpf,
    humid = relh,
    wind = sknt,
    precip = p01i,
    visibility = vsby,
    w_code = wxcodes
    )

# IEM encodes missing values as the literal string "null"; blank those to NA.
# NOTE(review): this logical-matrix subscript assignment can error if any
# cell is already NA (NA is not allowed in subscripted assignments) —
# confirm the raw export has no NA cells, or use read.csv(na.strings = ...).
weather_data[weather_data == "null"] <- NA

# Clean the hourly weather series: hourly timestamps, numeric types,
# de-duplicated times, and light gap-filling.
numeric_cols <- c("temp", "precip", "dew", "humid", "wind",
                  "visibility", "feel", "gust")
interp_cols <- c("temp", "dew", "humid", "wind", "visibility", "feel")

weather_clean <- weather_data %>%
  mutate(
    # Parse in UTC and floor to the hour so the join keys line up;
    # without flooring, the merge would NA almost everything.
    interval60 = as.POSIXct(interval60, format = "%Y-%m-%d %H:%M", tz = "UTC"),
    interval60 = floor_date(interval60, unit = "hour"),
    across(all_of(numeric_cols), as.numeric)
  ) %>%
  # One observation per hour, in time order.
  distinct(interval60, .keep_all = TRUE) %>%
  arrange(interval60) %>%
  mutate(
    # Linear interpolation is acceptable here: missingness is sparse
    # and hourly variation is low.
    across(all_of(interp_cols), ~ na.approx(.x, na.rm = FALSE)),
    # Missing precipitation and gust readings are treated as zero.
    precip = coalesce(precip, 0),
    gust = coalesce(gust, 0),
    # Blank or missing weather codes default to clear conditions.
    w_code = if_else(is.na(w_code) | w_code == "", "CLR", w_code)
  )

cat("Weather:", format(nrow(weather_clean), big.mark = ","), "\n")
Weather: 95,458 

ii. Weather Pattern Visualization

Code
# 2025 Q1 daily temperature: restrict weather to the Q1 panel's time span.
weather_q1 <- weather_clean %>%
  filter(
    interval60 >= min(panel_q1_base$interval60),
    interval60 <= max(panel_q1_base$interval60)
    )

# Hourly temperature trace with a smoothed trend overlay.
weather_q1_plot <- ggplot(weather_q1, aes(interval60, temp)) +
  geom_line(color = "#00a557") +
  geom_smooth(se = FALSE, color = "#ff4100") +
  labs(
    title = "Philadelphia Temperature",
    subtitle = "2025 Q1",
    x = "Date", y = "Temperature (°F)"
    ) +
  theme_plot()

# 2024 Q4 daily temperature: same treatment for the Q4 panel's span.
weather_q4_daily <- weather_clean %>%
  filter(
    interval60 >= min(panel_q4_base$interval60),
    interval60 <= max(panel_q4_base$interval60)
    )

weather_q4_daily_plot <- ggplot(weather_q4_daily, aes(interval60, temp)) +
  geom_line(color = "#00a557") +
  geom_smooth(se = FALSE, color = "#ff4100") +
  labs(
    title = "Philadelphia Temperature",
    subtitle = "2024 Q4",
    x = "Date", y = "Temperature (°F)"
    ) +
  theme_plot()

# Monthly mean temperature.
# NOTE(review): despite the "2024 Q4" label, this aggregates the FULL
# weather_clean series (no date filter), and the result is not used in
# this chunk — confirm whether it should be filtered to the Q4 window.
weather_q4_monthly <- weather_clean %>%
  group_by(year = year(interval60), month = month(interval60)) %>%
  summarize(
    temp = mean(temp, na.rm = TRUE),
    date_month = make_date(year, month, 1),
    .groups = "drop"
    )

# Stack the two quarterly temperature plots.
weather_q1_plot / weather_q4_daily_plot

5. Build Complete Space-Time Panel

i. 2025 Q1 Complete Panel

Code
# Restrict the Q1 panel to stations with census matches, then enumerate
# the time and station dimensions of the complete grid.
panel_q1_base <- filter(panel_q1_base, start_station_id %in% valid_stations)

unique_times_q1 <- sort(unique(panel_q1_base$interval60))
unique_stations_q1 <- valid_stations

cat("2025 Q1 Complete Panel:\n")
2025 Q1 Complete Panel:
Code
cat("Stations:", length(unique_stations_q1), "\n")
Stations: 235 
Code
cat("Time Periods:", format(length(unique_times_q1), big.mark = ","), "\n")
Time Periods: 2,151 
Code
cat("Expected Rows:", format(length(unique_stations_q1) * length(unique_times_q1), big.mark = ","))
Expected Rows: 505,485
Code
# Build the complete station x hour grid for 2025 Q1 and derive
# calendar features from each timestamp in a single pipeline.
complete_grid_q1 <- expand.grid(
  interval60 = unique_times_q1,
  start_station_id = unique_stations_q1,
  KEEP.OUT.ATTRS = FALSE,
  stringsAsFactors = FALSE
) %>%
  mutate(
    date = as_date(interval60),
    year = year(interval60),
    month = month(interval60),
    dow = wday(date, label = TRUE, week_start = 1),
    hour = hour(interval60),
    # 1/0 flags for weekend days and commute-peak hours.
    is_weekend = ifelse(dow %in% c("Sat", "Sun"), 1, 0),
    rush_hour = ifelse(hour %in% c(7, 8, 9, 16, 17, 18), 1, 0),
    season = case_when(
      month %in% c(12, 1, 2) ~ "Winter",
      month %in% c(3, 4, 5) ~ "Spring",
      month %in% c(6, 7, 8) ~ "Summer",
      TRUE ~ "Fall"
    )
  )
Code
# Attach observed trip counts; grid cells with no trips become zero.
panel_q1_with_trips <- complete_grid_q1 %>%
  left_join(
    select(panel_q1_base, start_station_id, interval60, trips),
    by = c("start_station_id", "interval60")
    ) %>%
  mutate(trips = coalesce(trips, 0))

# Attach per-station census attributes.
panel_q1_with_station <- left_join(
  panel_q1_with_trips, station_census_lookup,
  by = "start_station_id"
)

# Attach hourly weather.
panel_q1_with_weather <- left_join(
  panel_q1_with_station, weather_clean,
  by = "interval60"
)
Code
# Short-term, half-day, and daily demand lags plus a trailing 7-day
# mean, computed within each station's time-ordered series.
panel_q1 <- panel_q1_with_weather %>%
  arrange(start_station_id, interval60) %>%
  group_by(start_station_id) %>%
  mutate(
    lag_1hr = lag(trips, 1),
    lag_2hr = lag(trips, 2),
    lag_3hr = lag(trips, 3),
    lag_12hr = lag(trips, 12),
    lag_1day = lag(trips, 24),
    # 168 hours = one trailing week; rollapplyr is right-aligned.
    avg_7day = rollapplyr(trips, 168, mean, fill = NA)
  ) %>%
  ungroup() %>%
  # Drop the first week, where the rolling mean is undefined.
  filter(!is.na(avg_7day))

cat("Final 2025 Q1 Panel:", format(nrow(panel_q1), big.mark = ","), "\n")
Final 2025 Q1 Panel: 466,240 

ii. 2024 Q4 Complete Panel

Code
# Restrict the Q4 panel to stations with census matches, then enumerate
# the time and station dimensions of the complete grid.
panel_q4_base <- filter(panel_q4_base, start_station_id %in% valid_stations)

unique_times_q4 <- sort(unique(panel_q4_base$interval60))
unique_stations_q4 <- valid_stations

cat("2024 Q4 Complete Panel:\n")
2024 Q4 Complete Panel:
Code
cat("Stations:", length(unique_stations_q4), "\n")
Stations: 235 
Code
cat("Time Periods:", format(length(unique_times_q4), big.mark = ","), "\n")
Time Periods: 1,343 
Code
cat( "Expected Rows", format(length(unique_stations_q4) * length(unique_times_q4), big.mark = ","), "\n\n" )
Expected Rows 315,605 
Code
# Build the complete station x hour grid for 2024 Q4 and derive
# calendar features from each timestamp in a single pipeline.
complete_grid_q4 <- expand.grid(
  interval60 = unique_times_q4,
  start_station_id = unique_stations_q4,
  KEEP.OUT.ATTRS = FALSE,
  stringsAsFactors = FALSE
) %>%
  mutate(
    date = as_date(interval60),
    year = year(interval60),
    month = month(interval60),
    dow = wday(date, label = TRUE, week_start = 1),
    hour = hour(interval60),
    # 1/0 flags for weekend days and commute-peak hours.
    is_weekend = ifelse(dow %in% c("Sat", "Sun"), 1, 0),
    rush_hour = ifelse(hour %in% c(7, 8, 9, 16, 17, 18), 1, 0),
    season = case_when(
      month %in% c(12, 1, 2) ~ "Winter",
      month %in% c(3, 4, 5) ~ "Spring",
      month %in% c(6, 7, 8) ~ "Summer",
      TRUE ~ "Fall"
    )
  )
Code
# Attach observed trip counts; grid cells with no trips become zero.
panel_q4_with_trips <- complete_grid_q4 %>%
  left_join(
    select(panel_q4_base, start_station_id, interval60, trips),
    by = c("start_station_id", "interval60")
  ) %>%
  mutate(trips = coalesce(trips, 0))

# Attach per-station census attributes.
panel_q4_with_station <- left_join(
  panel_q4_with_trips, station_census_lookup,
  by = "start_station_id"
)

# Attach hourly weather.
panel_q4_with_weather <- left_join(
  panel_q4_with_station, weather_clean,
  by = "interval60"
)
Code
# Short-term, half-day, and daily demand lags plus a trailing 7-day
# mean, computed within each station's time-ordered series.
panel_q4 <- panel_q4_with_weather %>%
  arrange(start_station_id, interval60) %>%
  group_by(start_station_id) %>%
  mutate(
    lag_1hr = lag(trips, 1),
    lag_2hr = lag(trips, 2),
    lag_3hr = lag(trips, 3),
    lag_12hr = lag(trips, 12),
    lag_1day = lag(trips, 24),
    # 168 hours = one trailing week; rollapplyr is right-aligned.
    avg_7day = rollapplyr(trips, 168, mean, fill = NA)
  ) %>%
  ungroup() %>%
  # Drop the first week, where the rolling mean is undefined.
  filter(!is.na(avg_7day))

cat("Final 2024 Q4 Panel:", format(nrow(panel_q4), big.mark = ","), "\n")
Final 2024 Q4 Panel: 276,360 

6. Visualize Temporal Patterns

i. Lag Plots

Code
# 2024 Q4 lag patterns for a single busy station (3010, 15th & Spruce),
# first 168 panel rows (one week), reshaped long so each lag gets a facet.
lag_data_long_q4 <- panel_q4 %>%
  filter(start_station_id == 3010) %>%
  head(168) %>%
  select(interval60, current_trips = trips, lag_1hr, lag_2hr, lag_3hr, lag_12hr, lag_1day) %>%
  pivot_longer(
    cols = starts_with("lag"),
    names_to = "lag_type",
    values_to = "lag_value"
    ) %>%
  # Order facets from shortest to longest lag with readable labels.
  mutate(lag_type = factor(
    lag_type,
    levels = c("lag_1hr", "lag_2hr", "lag_3hr", "lag_12hr", "lag_1day"),
    labels = c("1 Hour Ago", "2 Hours Ago", "3 Hours Ago", "12 Hours Ago", "24 Hours Ago")
    )
    )

# Current demand (solid) vs. lagged demand (dashed), one facet per lag.
lag_facet_plot_q4 <- ggplot(lag_data_long_q4, aes(x = interval60)) +
  geom_line(aes(y = current_trips, color = "Current Demand"), linewidth = 0.75, alpha = 0.8) +
  geom_line(aes(y = lag_value, color = "Lagged Demand"), linewidth = 0.75, linetype = "dashed") +
  facet_wrap(~ lag_type, scales = "free_x", ncol = 2) +
  scale_color_manual(
    name = NULL,
    values = c("Current Demand" = "#4A6F53", "Lagged Demand" = "#f48f33")
    ) +
  labs(
    title = "One Week Short-Term vs. Daily Temporal Lags",
    subtitle = "2024 Q4",
    caption = "Station 3010: 15th & Spruce",
    x = "Date and Time",
    y = "Average Trip Count"
    ) +
  theme_plot() +
  theme(strip.text = element_text(family = "anonymous", face = "bold", size = 10))

# 2025 Q1 lag patterns: same station (3010) and same one-week window
# treatment as the Q4 version above.
lag_data_long_q1 <- panel_q1 %>%
  filter(start_station_id == 3010) %>%
  head(168) %>%
  select(interval60, current_trips = trips, lag_1hr, lag_2hr, lag_3hr, lag_12hr, lag_1day) %>%
  pivot_longer(
    cols = starts_with("lag"),
    names_to = "lag_type",
    values_to = "lag_value"
  ) %>%
  # Order facets from shortest to longest lag with readable labels.
  mutate(lag_type = factor(
    lag_type,
    levels = c("lag_1hr", "lag_2hr", "lag_3hr", "lag_12hr", "lag_1day"),
    labels = c("1 Hour Ago", "2 Hours Ago", "3 Hours Ago", "12 Hours Ago", "24 Hours Ago")
  ))

# Current demand (solid) vs. lagged demand (dashed), one facet per lag.
lag_facet_plot_q1 <- ggplot(lag_data_long_q1, aes(x = interval60)) +
  geom_line(aes(y = current_trips, color = "Current Demand"), linewidth = 0.75, alpha = 0.8) +
  geom_line(aes(y = lag_value, color = "Lagged Demand"), linewidth = 0.75, linetype = "dashed") +
  facet_wrap(~ lag_type, scales = "free_x", ncol = 2) +
  scale_color_manual(
    name = NULL,
    values = c("Current Demand" = "#4A6F53", "Lagged Demand" = "#f48f33")
  ) +
  labs(
    title = "One Week Short-Term vs. Daily Temporal Lags",
    subtitle = "2025 Q1",
    x = "Date and Time",
    y = "Trip Count"
  ) +
  theme_plot() +
  theme(strip.text = element_text(family = "anonymous", face = "bold", size = 10))

# Stack both quarters with one shared, horizontal legend.
(lag_facet_plot_q1 / lag_facet_plot_q4) +
  plot_layout(guides = "collect") & 
  theme(legend.position = "bottom",
        legend.direction = "horizontal")

ii. Hourly Ridership Patterns

Code
# 2024 Q4 hourly patterns: mean trips per station-hour, split by
# weekday vs. weekend.
hourly_patterns_q4 <- panel_q4 %>%
  group_by(hour, is_weekend) %>%
  summarize(avg_trips = mean(trips, na.rm = TRUE), .groups = "drop") %>%
  # is_weekend is a 1/0 flag, so ifelse maps it to a readable label.
  mutate(day_type = ifelse(is_weekend, "Weekend", "Weekday"))

hourly_patterns_q4_plot <- ggplot(hourly_patterns_q4, aes(x = hour, y = avg_trips, color = day_type)) +
  geom_line(linewidth = 1.5) +
  scale_color_manual(values = c("Weekday" = "#08519c", "Weekend" = "#6baed6")) +
  labs(
    title = "Average Hourly Ridership Patterns",
    subtitle = "2024 Q4",
    x = "Hour of Day",
    y = "Average Trips per Hour",
    color = "Day Type"
    ) +
  theme_plot()

# 2025 Q1 hourly patterns: same aggregation for the Q1 panel.
hourly_patterns_q1 <- panel_q1 %>%
  group_by(hour, is_weekend) %>%
  summarize(avg_trips = mean(trips, na.rm = TRUE), .groups = "drop") %>%
  mutate(day_type = ifelse(is_weekend, "Weekend", "Weekday"))

hourly_patterns_q1_plot <- ggplot(hourly_patterns_q1, aes(x = hour, y = avg_trips, color = day_type)) +
  geom_line(linewidth = 1.5) +
  scale_color_manual(values = c("Weekday" = "#08519c", "Weekend" = "#6baed6")) +
  labs(
    title = "Average Hourly Ridership Patterns",
    subtitle = "2025 Q1",
    caption = "Average trips per station",
    x = "Hour of Day",
    y = "Average Trips per Hour",
    color = "Day Type"
    ) +
  theme_plot()

# Stack the two quarters for comparison.
hourly_patterns_q4_plot / hourly_patterns_q1_plot

iii. Top Stations Table

Code
# Rank all stations by number of trip origins in 2024 Q4.
top_stations <- bike_q4 %>%
  count(start_station_id, start_lat, start_lon, name = "trips") %>%
  arrange(desc(trips))

# Render only the 20 busiest stations — the caption promises "Top 20",
# but previously the entire station list was rendered — with
# comma-formatted trip counts.
top_stations_kable <- top_stations %>%
  slice_head(n = 20) %>%
  mutate(
    trips = scales::comma(trips)
    ) %>%
  kable(
    caption = "Top 20 Indego Stations by Trip Origins (2024 Q4)",
    col.names = c("Station ID", "Latitude", "Longitude", "Total Trips"),
    align = c("c", "c", "c", "r")
    ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE
    ) %>%
  footnote(
    general = "Total trips originating from the station during 2024 Q4 period.",
    general_title = "Note:",
    footnote_as_chunk = FALSE
    )

top_stations_kable
Top 20 Indego Stations by Trip Origins (2024 Q4)
Station ID Latitude Longitude Total Trips
3010 39.94711 -75.16618 3,412
3032 39.94527 -75.17971 2,508
3359 39.94888 -75.16978 2,229
3244 39.93865 -75.16674 2,012
3295 39.95028 -75.16027 1,892
3028 39.94061 -75.14958 1,885
3020 39.94855 -75.19007 1,882
3066 39.94561 -75.17348 1,868
3054 39.96250 -75.17420 1,856
3208 39.95048 -75.19324 1,853
3101 39.94295 -75.15955 1,808
3362 39.94816 -75.16226 1,734
3022 39.95472 -75.18323 1,723
3012 39.94218 -75.17747 1,669
3185 39.95169 -75.15888 1,663
3256 39.95269 -75.17779 1,659
3059 39.96244 -75.16121 1,657
3063 39.94633 -75.16980 1,608
3007 39.94517 -75.15993 1,589
3061 39.95425 -75.17761 1,577
3301 39.95029 -75.17126 1,553
3203 39.94077 -75.17227 1,546
3052 39.94732 -75.15695 1,518
3030 39.93935 -75.15716 1,512
3161 39.95486 -75.18091 1,512
3102 39.96759 -75.17952 1,495
3034 39.93315 -75.16248 1,479
3046 39.95012 -75.14472 1,477
3038 39.94781 -75.19409 1,427
3163 39.94974 -75.18097 1,416
3100 39.92777 -75.15103 1,340
3363 39.94655 -75.16263 1,335
3051 39.96744 -75.17507 1,332
3058 39.96716 -75.17001 1,283
3026 39.94182 -75.14550 1,263
3294 39.95174 -75.17063 1,262
3304 39.94234 -75.15399 1,254
3264 39.94571 -75.15508 1,236
3025 39.93724 -75.16120 1,167
3018 39.95273 -75.15979 1,157
3033 39.95005 -75.15672 1,152
3164 39.92814 -75.16515 1,135
3009 39.95576 -75.18982 1,133
3201 39.95523 -75.16620 1,112
3255 39.95095 -75.16438 1,100
3064 39.93840 -75.17327 1,087
3156 39.95381 -75.17407 1,079
3207 39.95441 -75.19200 1,071
3029 39.95380 -75.19479 1,067
3205 39.95405 -75.16783 1,051
3154 39.95924 -75.15821 1,045
3296 39.95134 -75.16758 1,033
3037 39.95424 -75.16138 1,028
3361 39.93082 -75.17474 1,020
3157 39.92545 -75.15954 1,005
3114 39.93775 -75.18012 1,004
3318 39.95182 -75.16395 1,004
3078 39.95355 -75.17192 1,003
3069 39.93704 -75.15038 996
3053 39.93231 -75.18154 987
3346 39.94883 -75.15097 981
3333 39.95410 -75.16965 974
3340 39.96906 -75.18169 969
3360 39.93304 -75.16822 967
3057 39.96439 -75.17987 960
3299 39.94332 -75.15698 942
3055 39.95112 -75.15457 941
3165 39.95819 -75.17820 931
3212 39.96383 -75.18177 930
3382 39.95038 -75.17309 927
3125 39.94391 -75.16735 917
3021 39.95390 -75.16902 912
3040 39.96289 -75.16606 912
3159 39.95121 -75.19962 911
3072 39.93445 -75.14541 874
3374 39.97280 -75.17971 850
3098 39.93431 -75.16042 842
3373 39.95817 -75.15668 813
3162 39.94595 -75.18475 801
3182 39.95081 -75.16953 801
3075 39.96718 -75.16125 796
3015 39.94735 -75.14886 785
3150 39.92884 -75.17021 781
3160 39.95662 -75.19862 780
3088 39.96984 -75.14180 767
3041 39.96849 -75.13546 766
3068 39.93549 -75.16711 757
3204 39.96437 -75.16582 755
3120 39.97522 -75.18669 752
3248 39.94213 -75.18335 747
3074 39.95511 -75.20987 735
3322 39.93638 -75.15526 735
3006 39.95220 -75.20311 713
3235 39.96000 -75.16510 710
3336 39.95719 -75.19388 710
3155 39.94018 -75.15442 708
3245 39.97887 -75.15777 704
3371 39.95340 -75.15430 693
3073 39.96143 -75.15242 691
3047 39.95073 -75.14947 678
3168 39.95134 -75.17394 676
3005 39.94733 -75.14403 675
3166 39.97195 -75.13445 675
3345 39.96170 -75.14667 669
3169 39.95382 -75.14263 656
3086 39.94019 -75.16691 655
3297 39.95821 -75.16901 653
3049 39.94509 -75.14250 651
3197 39.92083 -75.17033 649
3035 39.96271 -75.19419 648
3158 39.92552 -75.16904 647
3314 39.93682 -75.14492 632
3342 39.96390 -75.17385 631
3347 39.95484 -75.15449 623
3116 39.96006 -75.17198 622
3393 39.96727 -75.13999 621
3062 39.95197 -75.17943 610
3187 39.95725 -75.17232 610
3031 39.98005 -75.15522 608
3249 39.95784 -75.19079 601
3099 39.93401 -75.15094 592
3261 39.96304 -75.14099 584
3395 39.93866 -75.16351 583
3300 39.95511 -75.16287 578
3365 39.96173 -75.18788 572
3375 39.96036 -75.14020 568
3110 39.96175 -75.13641 536
3210 39.98492 -75.15668 534
3266 39.92370 -75.14701 531
3321 39.95485 -75.17298 529
3124 39.95367 -75.13956 518
3276 39.96569 -75.15679 509
3236 39.96566 -75.17765 498
3328 39.98798 -75.19950 488
3394 39.92400 -75.16959 487
3056 39.97669 -75.15813 483
3286 39.96409 -75.14769 476
3097 39.97888 -75.13339 471
3017 39.98003 -75.14371 470
3252 39.93882 -75.19324 468
3303 39.95590 -75.20530 463
3104 39.96664 -75.19209 462
3112 39.95373 -75.21825 453
3341 39.98131 -75.12732 450
3387 39.97012 -75.14666 449
3349 39.93651 -75.18621 447
3383 39.96634 -75.14222 439
3209 39.94900 -75.21278 437
3008 39.98081 -75.15067 425
3257 39.92160 -75.15000 418
3285 39.95946 -75.14745 408
3320 39.97460 -75.15917 403
3184 39.94573 -75.19551 402
3337 39.97318 -75.14612 400
3397 39.93906 -75.17824 398
3378 39.95238 -75.14728 397
3279 39.92650 -75.16680 394
3275 39.97576 -75.12157 386
3259 39.95998 -75.19883 384
3039 39.97157 -75.15993 382
3014 39.95886 -75.17369 380
3060 39.95923 -75.17036 377
3331 39.97116 -75.12813 360
3077 39.97207 -75.16351 354
3211 39.97172 -75.17060 345
3396 39.92327 -75.18210 345
3246 39.94782 -75.22301 342
3237 39.91717 -75.17096 322
3115 39.97263 -75.16757 321
3024 39.94822 -75.20908 317
3238 39.94628 -75.15138 317
3265 39.95957 -75.19633 316
3319 39.96672 -75.12964 306
3369 40.00772 -75.19194 305
3019 39.95403 -75.14983 294
3067 39.96411 -75.19973 292
3188 39.90471 -75.17340 289
3258 39.95828 -75.22424 288
3254 39.98091 -75.16065 287
3324 39.99484 -75.19242 273
3118 39.95866 -75.21323 272
3267 39.91800 -75.17960 271
3250 39.95673 -75.14575 266
3381 39.94845 -75.21807 259
3226 39.98101 -75.13721 254
3376 39.97183 -75.14878 254
3119 39.96674 -75.20799 233
3291 40.01022 -75.19700 232
3152 39.94993 -75.20286 229
3200 39.97583 -75.16846 224
3370 40.01630 -75.21241 220
3263 39.95860 -75.20810 219
3241 39.95225 -75.22637 214
3310 39.98024 -75.18651 212
3253 39.93174 -75.18996 206
3107 39.98203 -75.18866 204
3313 39.91580 -75.16430 198
3357 39.91040 -75.16588 191
3325 39.97680 -75.12764 178
3335 40.02237 -75.21800 177
3233 39.95501 -75.15208 175
3392 39.93633 -75.19447 170
3093 39.98837 -75.18701 162
3277 39.98306 -75.16738 159
3123 39.98004 -75.17088 158
3353 40.00496 -75.15233 158
3344 39.95961 -75.23354 156
3287 39.94367 -75.21606 152
3016 39.96892 -75.15470 151
3368 39.97647 -75.17362 149
3251 39.93366 -75.19319 144
3065 39.97070 -75.15171 143
3350 39.99415 -75.15527 136
3243 39.98350 -75.14743 135
3338 39.95098 -75.23035 124
3268 39.97633 -75.15101 121
3390 39.96000 -75.22440 121
3111 39.97779 -75.21323 116
3388 39.96634 -75.15097 115
3326 39.99198 -75.19944 112
3117 39.97841 -75.22399 110
3271 39.94760 -75.22946 108
3327 39.97759 -75.12442 101
3183 39.88994 -75.17679 99
3329 40.02651 -75.22445 96
3106 39.99179 -75.18637 95
3113 39.97472 -75.19781 93
3281 39.93230 -75.21620 92
3298 39.94717 -75.20691 91
3354 39.95628 -75.23870 88
3391 39.99667 -75.23482 84
3096 39.99119 -75.17975 82
3284 40.01112 -75.19287 82
3356 39.89322 -75.17187 79
3323 40.02796 -75.22770 78
3272 39.96405 -75.22048 75
3315 39.94235 -75.21138 65
3306 39.99683 -75.17761 64
3247 39.96538 -75.21514 55
3307 40.00280 -75.17400 47
3181 39.89629 -75.17514 44
3280 39.93968 -75.21362 43
3282 39.98628 -75.14925 43
3240 39.96277 -75.21185 42
3278 39.99301 -75.16810 41
3398 39.91551 -75.15474 41
3332 40.00537 -75.18143 32
3196 39.98717 -75.17477 31
3352 40.00143 -75.15256 21
3351 39.99655 -75.14991 20
3305 39.99569 -75.16710 9
3000 39.91591 -75.18370 7
3334 39.93020 -75.15636 3
3292 39.92000 -75.15620 1
3366 39.96693 -75.23359 1
Note:
Total trips originating from the station during 2024 Q4 period.

7. Train/Test Split

i. 2024 Q4 Data Split

Code
cat("2024 Q4 Train/Test Split\n")
2024 Q4 Train/Test Split
Code
# Week-of-year used as the temporal split key.
panel_q4 <- panel_q4 %>%
  mutate(week = as.numeric(week(date)))

# Test set begins with week 49 (beginning of December).
split_date <- 49

# Identify stations observed on both sides of the split.
early_stations <- panel_q4 %>%
  filter(week < split_date) %>%
  pull(start_station_id) %>%
  unique()

late_stations <- panel_q4 %>%
  filter(week >= split_date) %>%
  pull(start_station_id) %>%
  unique()

common_stations_q4 <- intersect(early_stations, late_stations)

cat("Early Stations (Weeks < 49):", length(early_stations), "\n")
Early Stations (Weeks < 49): 235 
Code
cat("Late Stations (Weeks >= 49):", length(late_stations), "\n")
Late Stations (Weeks >= 49): 235 
Code
cat("Both Period Stations:", length(common_stations_q4), "\n")
Both Period Stations: 235 
Code
# Train on the early weeks, test on the late weeks, restricted to
# stations present in both windows.
train <- panel_q4 %>%
  filter(start_station_id %in% common_stations_q4 & week < split_date)

test <- panel_q4 %>%
  filter(start_station_id %in% common_stations_q4 & week >= split_date)

cat("\n2024 Q4 Training Observations:", format(nrow(train), big.mark = ","), "\n")

2024 Q4 Training Observations: 170,375 
Code
cat("2024 Q4 Testing Observations:", format(nrow(test), big.mark = ","), "\n")
2024 Q4 Testing Observations: 105,985 
Code
cat("2024 Q4 Training Date Range:", format(min(train$date), big.mark = ","), "to", format(max(train$date), big.mark = ","), "\n")
2024 Q4 Training Date Range: 2024-10-19 to 2024-12-01 
Code
cat("2024 Q4 Testing Date Range:", format(min(test$date), big.mark = ","), "to", format(max(test$date), big.mark = ","), "\n")
2024 Q4 Testing Date Range: 2024-12-13 to 2024-12-31 

ii. 2025 Q1 Data Split

Code
cat("2025 Q1 Train/Test Split\n")
2025 Q1 Train/Test Split
Code
# Week-of-year used as the temporal split key.
panel_q1 <- panel_q1 %>%
  mutate(week = as.numeric(week(date)))

# Test set begins with week 10 (early March).
min_week <- 10

# Identify stations observed on both sides of the split.
early_stations_q1 <- panel_q1 %>%
  filter(week < min_week) %>%
  pull(start_station_id) %>%
  unique()

late_stations_q1 <- panel_q1 %>%
  filter(week >= min_week) %>%
  pull(start_station_id) %>%
  unique()

common_stations_q1 <- intersect(early_stations_q1, late_stations_q1)

cat("Early Stations (Weeks < 10):", length(early_stations_q1), "\n")
Early Stations (Weeks < 10): 235 
Code
cat("Late Stations (Weeks >= 10):", length(late_stations_q1), "\n")
Late Stations (Weeks >= 10): 235 
Code
cat("Both Period Stations:", length(common_stations_q1), "\n")
Both Period Stations: 235 
Code
# Train on the early weeks, test on the late weeks, restricted to
# stations present in both windows.
train_q1 <- panel_q1 %>%
  filter(start_station_id %in% common_stations_q1 & week < min_week)

test_q1 <- panel_q1 %>%
  filter(start_station_id %in% common_stations_q1 & week >= min_week)

cat("\n2025 Q1 Training Observations:", format(nrow(train_q1), big.mark = ","), "\n")

2025 Q1 Training Observations: 314,195 
Code
cat("2025 Q1 Testing Observations:", format(nrow(test_q1), big.mark = ","), "\n")
2025 Q1 Testing Observations: 152,045 
Code
cat("2025 Q1 Training Date Range:", format(min(train_q1$date), big.mark = ","), "to", format(max(train_q1$date), big.mark = ","), "\n")
2025 Q1 Training Date Range: 2025-01-07 to 2025-03-04 
Code
cat("2025 Q1 Testing Date Range:", format(min(test_q1$date), big.mark = ","), "to", format(max(test_q1$date), big.mark = ","), "\n")
2025 Q1 Testing Date Range: 2025-03-05 to 2025-03-31 

9. Modeling

i. Preparation and Helper Function

Code
# Predictor columns that must be non-missing before modeling.
model_predictors <- c(
  "trips", "hour", "dow", "temp", "precip",
  "lag_1hr", "lag_3hr", "lag_1day",
  "med_inc", "pct_public_commute", "pct_white",
  "start_station_id", "feel", "gust", "w_code",
  "rush_hour", "is_weekend"
)

# Day of week levels.
dow_levels <- c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")

# Prepare a modeling dataset: build an ordered day-of-week factor and
# keep complete cases on the predictor set. Training sets additionally
# get explicit treatment contrasts on dow_simple (as before; test sets
# did not have contrasts set). Replaces four copy-pasted pipelines.
prep_model_data <- function(df, set_contrasts = FALSE) {
  out <- df %>%
    mutate(dow_simple = factor(dow, levels = dow_levels)) %>%
    drop_na(all_of(model_predictors))
  if (set_contrasts) {
    contrasts(out$dow_simple) <- contr.treatment(7)
  }
  out
}

# 2024 Q4 train/test sets.
train_clean <- prep_model_data(train, set_contrasts = TRUE)
test_clean <- prep_model_data(test)

# 2025 Q1 train/test sets.
train_q1_clean <- prep_model_data(train_q1, set_contrasts = TRUE)
test_q1_clean <- prep_model_data(test_q1)

cat("\nFinal Clean Dataset Dimensions:\n")

Final Clean Dataset Dimensions:
Code
cat("2024 Q4 Train:", dim(train_clean), "\n")
2024 Q4 Train: 170375 37 
Code
cat("2024 Q4 Test:", dim(test_clean), "\n")
2024 Q4 Test: 105985 37 
Code
cat("2025 Q1 Train:", dim(train_q1_clean), "\n")
2025 Q1 Train: 314195 37 
Code
cat("2025 Q1 Test:", dim(test_q1_clean), "\n")
2025 Q1 Test: 151575 37 
Code
# Summarize a fitted lm: adjusted R-squared and AIC in a one-row tibble
# labeled with the supplied model name.
get_metrics <- function(model, model_name) {
  tibble(
    model = model_name,
    adjusted_r_squared = summary(model)$adj.r.squared,
    aic = AIC(model)
  )
}

ii. 2024 Q4 Models

Code
# Baseline OLS: hour-of-day and day-of-week fixed effects plus
# temperature and precipitation (no demand lags, no station effects).
model1_q4 <- lm(
  trips ~ as.factor(hour) + dow_simple + temp + precip,
  data = train_clean
)

summary(model1_q4)

Call:
lm(formula = trips ~ as.factor(hour) + dow_simple + temp + precip, 
    data = train_clean)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.5777 -0.6422 -0.2163  0.1558 27.3544 

Coefficients:
                    Estimate Std. Error t value             Pr(>|t|)    
(Intercept)       -0.1430562  0.0217757  -6.570     0.00000000005061 ***
as.factor(hour)1  -0.1180403  0.0186562  -6.327     0.00000000025038 ***
as.factor(hour)2  -0.2097570  0.0186531 -11.245 < 0.0000000000000002 ***
as.factor(hour)3  -0.2987309  0.0186626 -16.007 < 0.0000000000000002 ***
as.factor(hour)4  -0.3793297  0.0188334 -20.141 < 0.0000000000000002 ***
as.factor(hour)5  -0.4638038  0.0188616 -24.590 < 0.0000000000000002 ***
as.factor(hour)6  -0.5109719  0.0188255 -27.142 < 0.0000000000000002 ***
as.factor(hour)7  -0.5453174  0.0188254 -28.967 < 0.0000000000000002 ***
as.factor(hour)8  -0.6067859  0.0188053 -32.267 < 0.0000000000000002 ***
as.factor(hour)9  -0.6061100  0.0188391 -32.173 < 0.0000000000000002 ***
as.factor(hour)10 -0.4829160  0.0188981 -25.554 < 0.0000000000000002 ***
as.factor(hour)11 -0.2615281  0.0189850 -13.775 < 0.0000000000000002 ***
as.factor(hour)12  0.0393385  0.0190396   2.066             0.038816 *  
as.factor(hour)13  0.0789763  0.0191002   4.135     0.00003553704609 ***
as.factor(hour)14 -0.1113874  0.0191298  -5.823     0.00000000580041 ***
as.factor(hour)15 -0.1662472  0.0191074  -8.701 < 0.0000000000000002 ***
as.factor(hour)16 -0.0564510  0.0190538  -2.963             0.003050 ** 
as.factor(hour)17  0.0209120  0.0189714   1.102             0.270338    
as.factor(hour)18  0.0817785  0.0189016   4.327     0.00001515651955 ***
as.factor(hour)19  0.1731962  0.0188796   9.174 < 0.0000000000000002 ***
as.factor(hour)20  0.3494276  0.0188502  18.537 < 0.0000000000000002 ***
as.factor(hour)21  0.6014555  0.0188368  31.930 < 0.0000000000000002 ***
as.factor(hour)22  0.5135560  0.0188248  27.281 < 0.0000000000000002 ***
as.factor(hour)23  0.2006940  0.0186627  10.754 < 0.0000000000000002 ***
dow_simple2        0.0133471  0.0106302   1.256             0.209270    
dow_simple3        0.0700472  0.0101498   6.901     0.00000000000517 ***
dow_simple4       -0.0283340  0.0101622  -2.788             0.005301 ** 
dow_simple5       -0.0037453  0.0107601  -0.348             0.727785    
dow_simple6       -0.0034300  0.0107996  -0.318             0.750783    
dow_simple7        0.0353427  0.0105020   3.365             0.000765 ***
temp               0.0157214  0.0003068  51.250 < 0.0000000000000002 ***
precip             1.0015075  0.3588321   2.791             0.005255 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.125 on 170343 degrees of freedom
Multiple R-squared:  0.1097,    Adjusted R-squared:  0.1095 
F-statistic:   677 on 31 and 170343 DF,  p-value: < 0.00000000000000022
Code
# In-sample fit metrics (adjusted R-squared, AIC) for the baseline model.
metrics1_q4 <- get_metrics(model1_q4, "Model 1 2024 Q4")

# Out-of-sample evaluation: predict the held-out weeks and compute
# mean absolute error against observed trips.
test_clean$pred1 <- predict(model1_q4, newdata = test_clean)
mae_value <- mean(abs(test_clean$trips - test_clean$pred1), na.rm = TRUE)

# Add MAE to metrics table.
metrics1_q4 <- metrics1_q4 %>% mutate(mae = mae_value)
Code
# Baseline predictors plus short-term (1h, 3h) and daily (24h)
# demand lags.
model2_q4 <- lm(
  trips ~ as.factor(hour) + dow_simple + temp + precip +
    lag_1hr + lag_3hr + lag_1day,
  data = train_clean
)

summary(model2_q4)

Call:
lm(formula = trips ~ as.factor(hour) + dow_simple + temp + precip + 
    lag_1hr + lag_3hr + lag_1day, data = train_clean)

Residuals:
     Min       1Q   Median       3Q      Max 
-13.0643  -0.4332  -0.1172   0.0780  25.6134 

Coefficients:
                    Estimate Std. Error t value             Pr(>|t|)    
(Intercept)       -0.2114799  0.0189201 -11.178 < 0.0000000000000002 ***
as.factor(hour)1  -0.0219456  0.0161962  -1.355             0.175423    
as.factor(hour)2  -0.0179713  0.0162167  -1.108             0.267779    
as.factor(hour)3  -0.0353060  0.0162562  -2.172             0.029868 *  
as.factor(hour)4  -0.0566840  0.0164310  -3.450             0.000561 ***
as.factor(hour)5  -0.0834783  0.0164821  -5.065     0.00000040929343 ***
as.factor(hour)6  -0.0842748  0.0164793  -5.114     0.00000031578853 ***
as.factor(hour)7  -0.0817915  0.0165097  -4.954     0.00000072721825 ***
as.factor(hour)8  -0.0959344  0.0165348  -5.802     0.00000000656559 ***
as.factor(hour)9  -0.0706680  0.0165968  -4.258     0.00002064397272 ***
as.factor(hour)10  0.0208426  0.0166627   1.251             0.210988    
as.factor(hour)11  0.1498945  0.0167446   8.952 < 0.0000000000000002 ***
as.factor(hour)12  0.3021175  0.0167897  17.994 < 0.0000000000000002 ***
as.factor(hour)13  0.2206304  0.0167942  13.137 < 0.0000000000000002 ***
as.factor(hour)14  0.0486514  0.0167284   2.908             0.003634 ** 
as.factor(hour)15  0.0194684  0.0166374   1.170             0.241939    
as.factor(hour)16  0.1022476  0.0165857   6.165     0.00000000070734 ***
as.factor(hour)17  0.1467333  0.0165391   8.872 < 0.0000000000000002 ***
as.factor(hour)18  0.1707248  0.0164877  10.355 < 0.0000000000000002 ***
as.factor(hour)19  0.2180740  0.0164538  13.254 < 0.0000000000000002 ***
as.factor(hour)20  0.3149035  0.0164337  19.162 < 0.0000000000000002 ***
as.factor(hour)21  0.4482269  0.0164584  27.234 < 0.0000000000000002 ***
as.factor(hour)22  0.3018035  0.0164390  18.359 < 0.0000000000000002 ***
as.factor(hour)23  0.0878708  0.0162305   5.414     0.00000006173642 ***
dow_simple2        0.0084040  0.0092276   0.911             0.362435    
dow_simple3        0.0145596  0.0088135   1.652             0.098542 .  
dow_simple4       -0.0623971  0.0088229  -7.072     0.00000000000153 ***
dow_simple5       -0.0193919  0.0093401  -2.076             0.037878 *  
dow_simple6       -0.0046540  0.0093751  -0.496             0.619598    
dow_simple7        0.0046139  0.0091161   0.506             0.612772    
temp               0.0063979  0.0002696  23.732 < 0.0000000000000002 ***
precip             0.4710113  0.3114191   1.512             0.130417    
lag_1hr            0.2569245  0.0023223 110.632 < 0.0000000000000002 ***
lag_3hr            0.1076861  0.0022363  48.153 < 0.0000000000000002 ***
lag_1day           0.2839577  0.0022333 127.149 < 0.0000000000000002 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.9765 on 170340 degrees of freedom
Multiple R-squared:  0.3295,    Adjusted R-squared:  0.3294 
F-statistic:  2462 on 34 and 170340 DF,  p-value: < 0.00000000000000022
Code
# Score the held-out test set and compute mean absolute error,
# skipping any rows where predict() produced NA.
test_clean$pred2 <- predict(model2_q4, newdata = test_clean)
mae_value <- mean(abs(test_clean$trips - test_clean$pred2), na.rm = TRUE)

# Collect in-sample fit statistics, then attach the out-of-sample MAE.
metrics2_q4 <- get_metrics(model2_q4, "Model 2 2024 Q4") %>%
  mutate(mae = mae_value)
Code
# Fit model.
# Model 3 (2024 Q4): Model 2 plus tract-level demographics
# (median income, % transit commuters, % white).
model3_q4 <- lm(
  trips ~ as.factor(hour) + dow_simple + temp + precip +
    lag_1hr + lag_3hr + lag_1day +
    med_inc + pct_public_commute + pct_white,
  data = train_clean
)

summary(model3_q4)

Call:
lm(formula = trips ~ as.factor(hour) + dow_simple + temp + precip + 
    lag_1hr + lag_3hr + lag_1day + med_inc + pct_public_commute + 
    pct_white, data = train_clean)

Residuals:
     Min       1Q   Median       3Q      Max 
-12.6752  -0.4346  -0.1265   0.1352  25.5614 

Coefficients:
                         Estimate     Std. Error t value             Pr(>|t|)
(Intercept)        -0.36514890729  0.02136538011 -17.091 < 0.0000000000000002
as.factor(hour)1   -0.02524000104  0.01614864704  -1.563              0.11806
as.factor(hour)2   -0.02580182445  0.01617064638  -1.596              0.11058
as.factor(hour)3   -0.04637960684  0.01621184952  -2.861              0.00423
as.factor(hour)4   -0.07017533192  0.01638791614  -4.282  0.00001852065766219
as.factor(hour)5   -0.09915577579  0.01644076232  -6.031  0.00000000163187585
as.factor(hour)6   -0.10181651510  0.01643983438  -6.193  0.00000000059058444
as.factor(hour)7   -0.10098814753  0.01647197032  -6.131  0.00000000087569401
as.factor(hour)8   -0.11722804485  0.01649947452  -7.105  0.00000000000120829
as.factor(hour)9   -0.09320029167  0.01656286991  -5.627  0.00000001835918720
as.factor(hour)10  -0.00111350668  0.01662770527  -0.067              0.94661
as.factor(hour)11   0.13035567225  0.01670635192   7.803  0.00000000000000609
as.factor(hour)12   0.28706680035  0.01674672887  17.142 < 0.0000000000000002
as.factor(hour)13   0.20988768797  0.01674793286  12.532 < 0.0000000000000002
as.factor(hour)14   0.03870142229  0.01668192733   2.320              0.02034
as.factor(hour)15   0.01060710603  0.01659058035   0.639              0.52260
as.factor(hour)16   0.09462113927  0.01653844469   5.721  0.00000001058984059
as.factor(hour)17   0.13919677288  0.01649186852   8.440 < 0.0000000000000002
as.factor(hour)18   0.16411698457  0.01644025939   9.983 < 0.0000000000000002
as.factor(hour)19   0.21334025666  0.01640587142  13.004 < 0.0000000000000002
as.factor(hour)20   0.31290060251  0.01638528192  19.096 < 0.0000000000000002
as.factor(hour)21   0.44993043308  0.01640985405  27.418 < 0.0000000000000002
as.factor(hour)22   0.30567605729  0.01639084064  18.649 < 0.0000000000000002
as.factor(hour)23   0.08979242535  0.01618261829   5.549  0.00000002882348671
dow_simple2         0.00897167565  0.00920038334   0.975              0.32949
dow_simple3         0.01692413952  0.00878774645   1.926              0.05412
dow_simple4        -0.06093644312  0.00879691910  -6.927  0.00000000000431310
dow_simple5        -0.01842193844  0.00931258347  -1.978              0.04791
dow_simple6        -0.00415126650  0.00934739641  -0.444              0.65696
dow_simple7         0.00601406463  0.00908929041   0.662              0.50819
temp                0.00675685832  0.00026903104  25.116 < 0.0000000000000002
precip              0.48999501689  0.31049928107   1.578              0.11455
lag_1hr             0.24961626798  0.00232681486 107.278 < 0.0000000000000002
lag_3hr             0.09902222804  0.00224626695  44.083 < 0.0000000000000002
lag_1day            0.27548835768  0.00224249083 122.849 < 0.0000000000000002
med_inc             0.00000037957  0.00000008578   4.425  0.00000964404055519
pct_public_commute -0.04881466120  0.03025634719  -1.613              0.10667
pct_white           0.25344907137  0.01467549318  17.270 < 0.0000000000000002
                      
(Intercept)        ***
as.factor(hour)1      
as.factor(hour)2      
as.factor(hour)3   ** 
as.factor(hour)4   ***
as.factor(hour)5   ***
as.factor(hour)6   ***
as.factor(hour)7   ***
as.factor(hour)8   ***
as.factor(hour)9   ***
as.factor(hour)10     
as.factor(hour)11  ***
as.factor(hour)12  ***
as.factor(hour)13  ***
as.factor(hour)14  *  
as.factor(hour)15     
as.factor(hour)16  ***
as.factor(hour)17  ***
as.factor(hour)18  ***
as.factor(hour)19  ***
as.factor(hour)20  ***
as.factor(hour)21  ***
as.factor(hour)22  ***
as.factor(hour)23  ***
dow_simple2           
dow_simple3        .  
dow_simple4        ***
dow_simple5        *  
dow_simple6           
dow_simple7           
temp               ***
precip                
lag_1hr            ***
lag_3hr            ***
lag_1day           ***
med_inc            ***
pct_public_commute    
pct_white          ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.9736 on 170337 degrees of freedom
Multiple R-squared:  0.3335,    Adjusted R-squared:  0.3333 
F-statistic:  2303 on 37 and 170337 DF,  p-value: < 0.00000000000000022
Code
# Get metrics.
metrics3_q4 <- get_metrics(model3_q4, "Model 3 2024 Q4")

# Predict test set and calculate MAE.
test_clean$pred3 <- predict(model3_q4, newdata = test_clean)
mae_value <- mean(abs(test_clean$trips - test_clean$pred3), na.rm = TRUE)

# Add MAE to metrics table.
metrics3_q4 <- metrics3_q4 %>% mutate(mae = mae_value)
Code
# Fit model.
# Model 4 (2024 Q4): Model 3 plus station fixed effects
# (one dummy per start station).
model4_q4 <- lm(
  trips ~ as.factor(hour) + dow_simple + temp + precip +
    lag_1hr + lag_3hr + lag_1day +
    med_inc + pct_public_commute + pct_white +
    as.factor(start_station_id),
  data = train_clean
)

# Summary too long with all station dummies, just show key metrics.
cat("Model 4 R-Squared:", summary(model4_q4)$r.squared, "\n")
Model 4 R-Squared: 0.3559007 
Code
# Adjusted R-squared penalizes the many station-dummy coefficients.
cat("Model 4 Adjusted R-Squared:", summary(model4_q4)$adj.r.squared, "\n")
Model 4 Adjusted R-Squared: 0.3548859 
Code
# Score the held-out test set and compute mean absolute error,
# skipping any rows where predict() produced NA.
test_clean$pred4 <- predict(model4_q4, newdata = test_clean)
mae_value <- mean(abs(test_clean$trips - test_clean$pred4), na.rm = TRUE)

# Collect in-sample fit statistics, then attach the out-of-sample MAE.
metrics4_q4 <- get_metrics(model4_q4, "Model 4 2024 Q4") %>%
  mutate(mae = mae_value)
Code
# Fit model.
# Model 5 (2024 Q4): Model 4 plus a rush-hour x weekend interaction
# (the `*` also adds both main effects).
model5_q4 <- lm(
  trips ~ as.factor(hour) + dow_simple + temp + precip +
    lag_1hr + lag_3hr + lag_1day +
    med_inc + pct_public_commute + pct_white +
    as.factor(start_station_id) +
    rush_hour * is_weekend,
  data = train_clean
)

# Summary too long with all station dummies, just show key metrics.
cat("Model 5 R-Squared:", summary(model5_q4)$r.squared, "\n")
Model 5 R-Squared: 0.3564155 
Code
# Adjusted R-squared penalizes the many station-dummy coefficients.
cat("Model 5 Adjusted R-Squared:", summary(model5_q4)$adj.r.squared, "\n")
Model 5 Adjusted R-Squared: 0.3553977 
Code
# Score the held-out test set and compute mean absolute error,
# skipping any rows where predict() produced NA.
test_clean$pred5 <- predict(model5_q4, newdata = test_clean)
mae_value <- mean(abs(test_clean$trips - test_clean$pred5), na.rm = TRUE)

# Collect in-sample fit statistics, then attach the out-of-sample MAE.
metrics5_q4 <- get_metrics(model5_q4, "Model 5 2024 Q4") %>%
  mutate(mae = mae_value)

ii. 2025 Q1 Models

Code
# Fit model.
# Model 1 (2025 Q1): baseline with time-of-day, day-of-week, and weather only.
model1_q1 <- lm(
  trips ~ as.factor(hour) + dow_simple + temp + precip,
  data = train_q1_clean
)

summary(model1_q1)

Call:
lm(formula = trips ~ as.factor(hour) + dow_simple + temp + precip, 
    data = train_q1_clean)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.9301 -0.4097 -0.1844  0.0297 18.5166 

Coefficients:
                    Estimate Std. Error t value             Pr(>|t|)    
(Intercept)        0.2111431  0.0090179  23.414 < 0.0000000000000002 ***
as.factor(hour)1  -0.1166129  0.0094079 -12.395 < 0.0000000000000002 ***
as.factor(hour)2  -0.1861559  0.0094079 -19.787 < 0.0000000000000002 ***
as.factor(hour)3  -0.2135923  0.0094091 -22.701 < 0.0000000000000002 ***
as.factor(hour)4  -0.2493917  0.0094082 -26.508 < 0.0000000000000002 ***
as.factor(hour)5  -0.3005554  0.0094093 -31.942 < 0.0000000000000002 ***
as.factor(hour)6  -0.3352189  0.0094961 -35.301 < 0.0000000000000002 ***
as.factor(hour)7  -0.3608244  0.0095403 -37.821 < 0.0000000000000002 ***
as.factor(hour)8  -0.4019441  0.0095012 -42.305 < 0.0000000000000002 ***
as.factor(hour)9  -0.4028833  0.0094650 -42.566 < 0.0000000000000002 ***
as.factor(hour)10 -0.3561613  0.0094458 -37.706 < 0.0000000000000002 ***
as.factor(hour)11 -0.2070777  0.0094771 -21.850 < 0.0000000000000002 ***
as.factor(hour)12 -0.0596288  0.0094924  -6.282  0.00000000033518396 ***
as.factor(hour)13  0.1732547  0.0095111  18.216 < 0.0000000000000002 ***
as.factor(hour)14 -0.0049037  0.0095230  -0.515               0.6066    
as.factor(hour)15 -0.1237011  0.0095168 -12.998 < 0.0000000000000002 ***
as.factor(hour)16 -0.1023293  0.0095003 -10.771 < 0.0000000000000002 ***
as.factor(hour)17 -0.0154179  0.0094711  -1.628               0.1035    
as.factor(hour)18 -0.0074054  0.0094452  -0.784               0.4330    
as.factor(hour)19  0.0152062  0.0094341   1.612               0.1070    
as.factor(hour)20  0.1028204  0.0094212  10.914 < 0.0000000000000002 ***
as.factor(hour)21  0.1997144  0.0094140  21.215 < 0.0000000000000002 ***
as.factor(hour)22  0.3587005  0.0094116  38.112 < 0.0000000000000002 ***
as.factor(hour)23  0.1597916  0.0093697  17.054 < 0.0000000000000002 ***
dow_simple2        0.0411101  0.0051226   8.025  0.00000000000000102 ***
dow_simple3        0.0319897  0.0051432   6.220  0.00000000049838708 ***
dow_simple4        0.0110736  0.0051312   2.158               0.0309 *  
dow_simple5        0.0376504  0.0051110   7.367  0.00000000000017555 ***
dow_simple6       -0.0583071  0.0051204 -11.387 < 0.0000000000000002 ***
dow_simple7       -0.0997147  0.0051268 -19.450 < 0.0000000000000002 ***
temp               0.0066492  0.0001625  40.912 < 0.0000000000000002 ***
precip            -2.7661939  0.1979613 -13.973 < 0.0000000000000002 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.763 on 314163 degrees of freedom
Multiple R-squared:  0.07947,   Adjusted R-squared:  0.07938 
F-statistic: 874.9 on 31 and 314163 DF,  p-value: < 0.00000000000000022
Code
# Score the held-out test set and compute mean absolute error,
# skipping any rows where predict() produced NA.
test_q1_clean$pred1 <- predict(model1_q1, newdata = test_q1_clean)
mae_value <- mean(abs(test_q1_clean$trips - test_q1_clean$pred1), na.rm = TRUE)

# Collect in-sample fit statistics, then attach the out-of-sample MAE.
metrics1_q1 <- get_metrics(model1_q1, "Model 1 2025 Q1") %>%
  mutate(mae = mae_value)
Code
# Fit model.
# Model 2 (2025 Q1): baseline plus lagged trip counts
# (1 hour, 3 hours, and 1 day back, per the variable names).
model2_q1 <- lm(
  trips ~ as.factor(hour) + dow_simple + temp + precip +
    lag_1hr + lag_3hr + lag_1day,
  data = train_q1_clean
)

summary(model2_q1)

Call:
lm(formula = trips ~ as.factor(hour) + dow_simple + temp + precip + 
    lag_1hr + lag_3hr + lag_1day, data = train_q1_clean)

Residuals:
    Min      1Q  Median      3Q     Max 
-4.8429 -0.2968 -0.1005  0.0221 17.7809 

Coefficients:
                   Estimate Std. Error t value             Pr(>|t|)    
(Intercept)        0.058690   0.008267   7.099 0.000000000001254933 ***
as.factor(hour)1  -0.070307   0.008596  -8.179 0.000000000000000287 ***
as.factor(hour)2  -0.076628   0.008599  -8.912 < 0.0000000000000002 ***
as.factor(hour)3  -0.065563   0.008608  -7.616 0.000000000000026216 ***
as.factor(hour)4  -0.074295   0.008618  -8.621 < 0.0000000000000002 ***
as.factor(hour)5  -0.098671   0.008630 -11.434 < 0.0000000000000002 ***
as.factor(hour)6  -0.109966   0.008718 -12.614 < 0.0000000000000002 ***
as.factor(hour)7  -0.115697   0.008768 -13.196 < 0.0000000000000002 ***
as.factor(hour)8  -0.132263   0.008747 -15.122 < 0.0000000000000002 ***
as.factor(hour)9  -0.122236   0.008723 -14.013 < 0.0000000000000002 ***
as.factor(hour)10 -0.081937   0.008708  -9.409 < 0.0000000000000002 ***
as.factor(hour)11  0.026417   0.008731   3.026             0.002482 ** 
as.factor(hour)12  0.104707   0.008734  11.989 < 0.0000000000000002 ***
as.factor(hour)13  0.242999   0.008745  27.786 < 0.0000000000000002 ***
as.factor(hour)14  0.032231   0.008722   3.695             0.000220 ***
as.factor(hour)15 -0.034123   0.008698  -3.923 0.000087361222669308 ***
as.factor(hour)16 -0.012587   0.008681  -1.450             0.147082    
as.factor(hour)17  0.064224   0.008656   7.420 0.000000000000117698 ***
as.factor(hour)18  0.059650   0.008636   6.907 0.000000000004946720 ***
as.factor(hour)19  0.076787   0.008624   8.904 < 0.0000000000000002 ***
as.factor(hour)20  0.131072   0.008608  15.226 < 0.0000000000000002 ***
as.factor(hour)21  0.183589   0.008606  21.333 < 0.0000000000000002 ***
as.factor(hour)22  0.283487   0.008616  32.904 < 0.0000000000000002 ***
as.factor(hour)23  0.085226   0.008563   9.953 < 0.0000000000000002 ***
dow_simple2       -0.002287   0.004679  -0.489             0.625015    
dow_simple3       -0.017683   0.004700  -3.762             0.000168 ***
dow_simple4       -0.030775   0.004688  -6.564 0.000000000052401433 ***
dow_simple5       -0.004898   0.004668  -1.049             0.294036    
dow_simple6       -0.086245   0.004681 -18.423 < 0.0000000000000002 ***
dow_simple7       -0.082461   0.004683 -17.608 < 0.0000000000000002 ***
temp               0.003147   0.000149  21.118 < 0.0000000000000002 ***
precip            -2.618734   0.180729 -14.490 < 0.0000000000000002 ***
lag_1hr            0.229769   0.001729 132.899 < 0.0000000000000002 ***
lag_3hr            0.098466   0.001700  57.922 < 0.0000000000000002 ***
lag_1day           0.238895   0.001713 139.457 < 0.0000000000000002 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.6963 on 314160 degrees of freedom
Multiple R-squared:  0.2334,    Adjusted R-squared:  0.2333 
F-statistic:  2813 on 34 and 314160 DF,  p-value: < 0.00000000000000022
Code
# Score the held-out test set and compute mean absolute error,
# skipping any rows where predict() produced NA.
test_q1_clean$pred2 <- predict(model2_q1, newdata = test_q1_clean)
mae_value <- mean(abs(test_q1_clean$trips - test_q1_clean$pred2), na.rm = TRUE)

# Collect in-sample fit statistics, then attach the out-of-sample MAE.
metrics2_q1 <- get_metrics(model2_q1, "Model 2 2025 Q1") %>%
  mutate(mae = mae_value)
Code
# Fit model.
# Model 3 (2025 Q1): Model 2 plus tract-level demographics
# (median income, % transit commuters, % white).
model3_q1 <- lm(
  trips ~ as.factor(hour) + dow_simple + temp + precip +
    lag_1hr + lag_3hr + lag_1day +
    med_inc + pct_public_commute + pct_white,
  data = train_q1_clean
)

summary(model3_q1)

Call:
lm(formula = trips ~ as.factor(hour) + dow_simple + temp + precip + 
    lag_1hr + lag_3hr + lag_1day + med_inc + pct_public_commute + 
    pct_white, data = train_q1_clean)

Residuals:
    Min      1Q  Median      3Q     Max 
-4.7032 -0.3026 -0.1062  0.0473 17.7465 

Coefficients:
                         Estimate     Std. Error t value             Pr(>|t|)
(Intercept)        -0.05417948554  0.00975080228  -5.556     0.00000002755970
as.factor(hour)1   -0.07115711011  0.00856830626  -8.305 < 0.0000000000000002
as.factor(hour)2   -0.08073352425  0.00857142685  -9.419 < 0.0000000000000002
as.factor(hour)3   -0.07182399235  0.00858187457  -8.369 < 0.0000000000000002
as.factor(hour)4   -0.08212456019  0.00859154018  -9.559 < 0.0000000000000002
as.factor(hour)5   -0.10782112675  0.00860428036 -12.531 < 0.0000000000000002
as.factor(hour)6   -0.12007702655  0.00869255187 -13.814 < 0.0000000000000002
as.factor(hour)7   -0.12674328902  0.00874300468 -14.497 < 0.0000000000000002
as.factor(hour)8   -0.14452330408  0.00872263252 -16.569 < 0.0000000000000002
as.factor(hour)9   -0.13509984909  0.00869932730 -15.530 < 0.0000000000000002
as.factor(hour)10  -0.09474602625  0.00868486905 -10.909 < 0.0000000000000002
as.factor(hour)11   0.01489156814  0.00870694313   1.710             0.087210
as.factor(hour)12   0.09561514749  0.00870793926  10.980 < 0.0000000000000002
as.factor(hour)13   0.23755101955  0.00871810724  27.248 < 0.0000000000000002
as.factor(hour)14   0.02862561113  0.00869453007   3.292             0.000994
as.factor(hour)15  -0.03870307777  0.00867016672  -4.464     0.00000804951797
as.factor(hour)16  -0.01585104489  0.00865378001  -1.832             0.066999
as.factor(hour)17   0.06045247465  0.00862848596   7.006     0.00000000000245
as.factor(hour)18   0.05570865104  0.00860837006   6.471     0.00000000009721
as.factor(hour)19   0.07315837118  0.00859642329   8.510 < 0.0000000000000002
as.factor(hour)20   0.12908561033  0.00858073504  15.044 < 0.0000000000000002
as.factor(hour)21   0.18317790843  0.00857818178  21.354 < 0.0000000000000002
as.factor(hour)22   0.28527053359  0.00858798569  33.217 < 0.0000000000000002
as.factor(hour)23   0.08733232658  0.00853563963  10.231 < 0.0000000000000002
dow_simple2        -0.00038966025  0.00466388331  -0.084             0.933416
dow_simple3        -0.01555183057  0.00468498364  -3.320             0.000902
dow_simple4        -0.02898413668  0.00467333548  -6.202     0.00000000055811
dow_simple5        -0.00298795189  0.00465308150  -0.642             0.520780
dow_simple6        -0.08513217168  0.00466647682 -18.243 < 0.0000000000000002
dow_simple7        -0.08337249792  0.00466806093 -17.860 < 0.0000000000000002
temp                0.00330278285  0.00014857582  22.230 < 0.0000000000000002
precip             -2.62722396292  0.18014746580 -14.584 < 0.0000000000000002
lag_1hr             0.22194377845  0.00173204135 128.140 < 0.0000000000000002
lag_3hr             0.08969390434  0.00170560273  52.588 < 0.0000000000000002
lag_1day            0.23022156905  0.00171829772 133.982 < 0.0000000000000002
med_inc             0.00000024919  0.00000004501   5.536     0.00000003101851
pct_public_commute -0.02166577108  0.01588193711  -1.364             0.172513
pct_white           0.19394421855  0.00769408881  25.207 < 0.0000000000000002
                      
(Intercept)        ***
as.factor(hour)1   ***
as.factor(hour)2   ***
as.factor(hour)3   ***
as.factor(hour)4   ***
as.factor(hour)5   ***
as.factor(hour)6   ***
as.factor(hour)7   ***
as.factor(hour)8   ***
as.factor(hour)9   ***
as.factor(hour)10  ***
as.factor(hour)11  .  
as.factor(hour)12  ***
as.factor(hour)13  ***
as.factor(hour)14  ***
as.factor(hour)15  ***
as.factor(hour)16  .  
as.factor(hour)17  ***
as.factor(hour)18  ***
as.factor(hour)19  ***
as.factor(hour)20  ***
as.factor(hour)21  ***
as.factor(hour)22  ***
as.factor(hour)23  ***
dow_simple2           
dow_simple3        ***
dow_simple4        ***
dow_simple5           
dow_simple6        ***
dow_simple7        ***
temp               ***
precip             ***
lag_1hr            ***
lag_3hr            ***
lag_1day           ***
med_inc            ***
pct_public_commute    
pct_white          ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.694 on 314157 degrees of freedom
Multiple R-squared:  0.2383,    Adjusted R-squared:  0.2382 
F-statistic:  2657 on 37 and 314157 DF,  p-value: < 0.00000000000000022
Code
# Score the held-out test set and compute mean absolute error,
# skipping any rows where predict() produced NA.
test_q1_clean$pred3 <- predict(model3_q1, newdata = test_q1_clean)
mae_value <- mean(abs(test_q1_clean$trips - test_q1_clean$pred3), na.rm = TRUE)

# Collect in-sample fit statistics, then attach the out-of-sample MAE.
metrics3_q1 <- get_metrics(model3_q1, "Model 3 2025 Q1") %>%
  mutate(mae = mae_value)
Code
# Fit model.
# Model 4 (2025 Q1): Model 3 plus station fixed effects
# (one dummy per start station).
model4_q1 <- lm(
  trips ~ as.factor(hour) + dow_simple + temp + precip +
    lag_1hr + lag_3hr + lag_1day +
    med_inc + pct_public_commute + pct_white +
    as.factor(start_station_id),
  data = train_q1_clean
)

# Summary too long with all station dummies, just show key metrics.
cat("Model 4 R-Squared:", summary(model4_q1)$r.squared, "\n")
Model 4 R-Squared: 0.2660519 
Code
# Adjusted R-squared penalizes the many station-dummy coefficients.
cat("Model 4 Adjusted R-Squared:", summary(model4_q1)$adj.r.squared, "\n")
Model 4 Adjusted R-Squared: 0.2654253 
Code
# Score the held-out test set and compute mean absolute error,
# skipping any rows where predict() produced NA.
test_q1_clean$pred4 <- predict(model4_q1, newdata = test_q1_clean)
mae_value <- mean(abs(test_q1_clean$trips - test_q1_clean$pred4), na.rm = TRUE)

# Collect in-sample fit statistics, then attach the out-of-sample MAE.
metrics4_q1 <- get_metrics(model4_q1, "Model 4 2025 Q1") %>%
  mutate(mae = mae_value)
Code
# Fit model.
# Model 5 (2025 Q1): Model 4 plus a rush-hour x weekend interaction
# (the `*` also adds both main effects).
model5_q1 <- lm(
  trips ~ as.factor(hour) + dow_simple + temp + precip +
    lag_1hr + lag_3hr + lag_1day +
    med_inc + pct_public_commute + pct_white +
    as.factor(start_station_id) +
    rush_hour * is_weekend,
  data = train_q1_clean
)

# Summary too long with all station dummies, just show key metrics.
cat("Model 5 R-Squared:", summary(model5_q1)$r.squared, "\n")
Model 5 R-Squared: 0.2670996 
Code
# Adjusted R-squared penalizes the many station-dummy coefficients.
cat("Model 5 Adjusted R-Squared:", summary(model5_q1)$adj.r.squared, "\n")
Model 5 Adjusted R-Squared: 0.2664715 
Code
# Score the held-out test set and compute mean absolute error,
# skipping any rows where predict() produced NA.
test_q1_clean$pred5 <- predict(model5_q1, newdata = test_q1_clean)
mae_value <- mean(abs(test_q1_clean$trips - test_q1_clean$pred5), na.rm = TRUE)

# Collect in-sample fit statistics, then attach the out-of-sample MAE.
metrics5_q1 <- get_metrics(model5_q1, "Model 5 2025 Q1") %>%
  mutate(mae = mae_value)
Code
# Combine all 10 model metrics tables and rank ascending by test-set MAE.
final_metrics <- bind_rows(
  metrics1_q4, metrics2_q4, metrics3_q4, metrics4_q4, metrics5_q4,
  metrics1_q1, metrics2_q1, metrics3_q1, metrics4_q1, metrics5_q1
) %>%
  arrange(mae)

# Create kable. Four display columns are selected, so `align` must have
# exactly four entries (the original supplied five).
table_final <- final_metrics %>%
  mutate(
    adjusted_r_squared = round(adjusted_r_squared, 4),
    aic = scales::comma(round(aic, 0)),
    mae = round(mae, 4)
  ) %>%
  select(model, mae, adjusted_r_squared, aic) %>%
  kable(
    caption = "Model Performance Comparison (2024 Q4 vs. 2025 Q1)",
    col.names = c("Model and Features", "MAE (Trips / Hour)", "Adjusted R²", "AIC"),
    align = c("l", "c", "c", "c")
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE
  ) %>%
  footnote(
    general = "Average predicted trip count error per station-hour.\nSorted ascending.",
    general_title = "MAE (Mean Absolute Error): Lower is better.",
    footnote_as_chunk = FALSE
  )

table_final
Model Performance Comparison (2024 Q4 vs. 2025 Q1)
Model and Features MAE (Trips / Hour) Adjusted R² AIC
Model 2 2024 Q4 0.3934 0.3294 475,425
Model 3 2024 Q4 0.4006 0.3333 474,420
Model 4 2024 Q4 0.4291 0.3549 469,046
Model 5 2024 Q4 0.4297 0.3554 468,912
Model 1 2024 Q4 0.5102 0.1095 523,727
Model 4 2025 Q1 0.5257 0.2654 651,003
Model 5 2025 Q1 0.5260 0.2665 650,556
Model 3 2025 Q1 0.5322 0.2382 662,189
Model 2 2025 Q1 0.5338 0.2333 664,213
Model 1 2025 Q1 0.6195 0.0794 721,698
MAE (Mean Absolute Error): Lower is better.
Average predicted trip count error per station-hour.
Sorted ascending.
Code
# Prepare data: short complexity labels for the x-axis and a quarter
# variable for faceting.
mae_plot_data <- final_metrics %>%
  mutate(
    # Label for x-axis.
    model_label = case_when(
      grepl("Model 1", model) ~ "1. Time / Weather",
      grepl("Model 2", model) ~ "2. + Lags",
      grepl("Model 3", model) ~ "3. + Demographics",
      grepl("Model 4", model) ~ "4. + Station FE",
      grepl("Model 5", model) ~ "5. + Interaction"
    ),
    # Quarter for faceting.
    forecast_type = ifelse(
      grepl("2024 Q4", model),
      "2024 Q4",
      "2025 Q1"
    )
  )

# Create MAE comparison plot; bars are ordered from highest to lowest MAE.
mae_comp_plot <- ggplot(mae_plot_data, aes(
  x = reorder(model_label, -mae),
  y = mae,
  fill = forecast_type
)) +
  geom_col(alpha = 0.9,
    colour = "#3d3b3c",
    linewidth = 1) +
  geom_text(
    aes(label = round(mae, 3)),
    vjust = -0.3,
    size = 3.5,
    color = "#2d2a26",
    family = "anonymous"
  ) +
  facet_wrap(~ forecast_type, scales = "free_x", ncol = 2) +
  scale_fill_manual(values = c(
    "2024 Q4" = "#f6a2a7",
    "2025 Q1" = "#8bcef2"
  )) +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  labs(
    title = "Model Prediction Accuracy Comparison",
    # Fixed duplicated "(2024 Q4)" in the subtitle.
    subtitle = "Mean Absolute Error (MAE): 2024 Q4 vs. 2025 Q1 models",
    x = "Model Complexity",
    y = "Mean Absolute Error (Trips per Hour)",
    fill = NULL
  ) +
  theme_plot() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
    legend.position = "none",
    strip.text = element_text(
      face = "bold",
      size = 10,
      family = "outfit",
      color = "#2d2a26"
    )
  )

mae_comp_plot

  1. Compare results to 2025 Q1:

    • How do MAE values compare? Why might they differ?
    • Are temporal patterns different (e.g., summer vs. winter)?
    • Which features are most important in your quarter?

PART II: ERROR ANALYSIS

1. Spatial Patterns

Code
# Rebuild the day-of-week factor on the test set with explicit levels and
# treatment contrasts, mirroring the setup used on the training data.
test_clean <- test_clean %>%
  mutate(dow_simple = factor(dow, levels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")))

contrasts(test_clean$dow_simple) <- contr.treatment(7)

# Attach Model 2 predictions to the test set.
test_clean <- test_clean %>%
  mutate(
    model2_q4_pred = predict(model2_q4, newdata = test_clean)
  )

# Signed / absolute errors plus a coarse time-of-day bucket.
# case_when() evaluates branches top-down, so later branches only need an
# upper bound.
test_q4_error <- test_clean %>%
  mutate(
    error = trips - model2_q4_pred,
    abs_error = abs(error),
    time_of_day = case_when(
      hour < 7 ~ "Overnight",
      hour < 10 ~ "AM Rush",
      hour < 15 ~ "Mid-Day",
      hour <= 18 ~ "PM Rush",
      hour > 18 ~ "Evening"
    )
  )

# Per-station test-set MAE and average observed demand, keeping only
# stations with usable coordinates.
station_errors_q4 <- test_q4_error %>%
  group_by(start_station_id, start_lat, start_lon) %>%
  summarize(
    MAE = mean(abs_error, na.rm = TRUE),
    avg_demand = mean(trips, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(!(is.na(start_lat) | is.na(start_lon)))

# Map of per-station prediction error.
pred_error_q4_map <- ggplot() +
  geom_sf(data = philly_census, fill = "#2d2a26", color = "#ff4100", linewidth = 0.35) +
  geom_point(
    data = station_errors_q4,
    aes(x = start_lon, y = start_lat, color = MAE),
    alpha = 0.75,
    size = 2
  ) +
  scale_color_viridis(
    option = "mako",
    name = "MAE (Trips)",
    direction = -1,
    breaks = c(0.5, 1.0, 1.5)
  ) +
  labs(
    title = "Model Prediction Errors",
    subtitle = "MAE per Station (Q4 2024 Test Set)"
  ) +
  theme_map()

# Map of average observed demand per station.
avg_dem_q4_map <- ggplot() +
  geom_sf(data = philly_census, fill = "#2d2a26", color = "#ff4100", linewidth = 0.35) +
  geom_point(
    data = station_errors_q4,
    aes(x = start_lon, y = start_lat, color = avg_demand),
    alpha = 0.75,
    size = 2
  ) +
  scale_color_viridis(
    option = "mako",
    name = "Average Demand (Trips per Hour)",
    direction = -1
  ) +
  labs(
    title = "Average Station Demand",
    subtitle = "Trips per Station-Hour (2024 Q4 Test Set)"
  ) +
  theme_map()

# Display the two maps side by side (patchwork).
pred_error_q4_map | avg_dem_q4_map

-   Create error maps
-   Identify neighborhoods with high errors
-   Hypothesize why (missing features? different demand patterns?)

2. Temporal Patterns

Code
# Scatter of observed versus predicted trips, faceted by weekend flag
# (rows) and time-of-day bucket (columns).
obs_vs_pred_plot <- ggplot(test_q4_error, aes(y = model2_q4_pred, x = trips)) +
  geom_point(color = "#1492D3", alpha = 0.25) +
  # 45-degree reference: points on this line are perfect predictions.
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", linewidth = 1, color = "#F03E36") +
  # OLS fit of predicted on observed within each facet.
  geom_smooth(method = "lm", se = FALSE, linetype = "dashed", color = "#00a557") +
  facet_grid(is_weekend ~ time_of_day) +
  labs(
    title = "Observed vs. Predicted Bike Trips (2024 Q4 Model 2)",
    subtitle = "Red Line = Perfect Predictions\nGreen Line = OLS Fit",
    caption = "Performance grouped by time of day and weekday / weekend.",
    x = "Observed Trips (Trips per Hour)",
    y = "Predicted Trips (Model 2)"
  ) +
  theme_plot() +
  theme(
    axis.text.x = element_text(hjust = 1, size = 8),
    legend.position = "none",
    strip.text = element_text(
      face = "bold",
      size = 10,
      family = "anonymous",
      color = "#2d2a26"
    )
  )

obs_vs_pred_plot

Code
# Calculate MAE by time of day and weekday/weekend.
# Convert time_of_day to an ordered factor so the x-axis reads
# chronologically (Overnight -> Evening) instead of alphabetically.
temporal_errors_q4 <- test_q4_error %>%
  group_by(time_of_day, is_weekend) %>%
  summarize(
    MAE = mean(abs_error, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    day_type = ifelse(is_weekend, "Weekend", "Weekday"),
    time_of_day = factor(
      time_of_day,
      levels = c("Overnight", "AM Rush", "Mid-Day", "PM Rush", "Evening")
    )
  )

temp_error_plot <- ggplot(temporal_errors_q4, aes(x = time_of_day, y = MAE, fill = day_type)) +
  geom_col(position = "dodge",
    colour = "#3d3b3c",
    linewidth = 1) +
  scale_fill_manual(values = c("Weekday" = "#f6a2a7", "Weekend" = "#8bcef2")) +
  labs(
    title = "Prediction Errors by Time Period",
    subtitle = "2024 Q4",
    caption = "MAE is highest during commuter peaks.",
    x = "Time of Day.",
    y = "Mean Absolute Error (Trips).",
    fill = "Day Type"
  ) +
  theme_plot() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

temp_error_plot

-   When are errors highest?
-   Do certain hours/days have systematic under/over-prediction?
-   Are there seasonal patterns?

3. Demographic Patterns

Code
# Join tract-level census attributes onto the station-level errors and
# drop stations without a matched income value.
station_errors_demo_q4 <- station_errors_q4 %>%
  left_join(
    station_census_lookup %>%
      select(start_station_id, med_inc, pct_public_commute, pct_white),
    by = "start_station_id"
  ) %>%
  drop_na(med_inc)

# MAE against tract median income.
mae_inc_q4_map <- ggplot(station_errors_demo_q4, aes(x = med_inc, y = MAE)) +
  geom_point(color = "#011f5b", alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, linetype = "dashed", color = "#990000") +
  scale_x_continuous(labels = scales::dollar) +
  labs(
    title = "Prediction Errors vs. Median Income",
    subtitle = "2024 Q4",
    x = "Median Income.",
    y = "MAE"
  ) +
  theme_plot()

# MAE against the share of commuters taking public transit.
mae_pub_q4_map <- ggplot(station_errors_demo_q4, aes(x = pct_public_commute, y = MAE)) +
  geom_point(color = "#011f5b", alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, linetype = "dashed", color = "#990000") +
  labs(
    title = "Prediction Errors vs. Transit Usage",
    subtitle = "2024 Q4",
    x = "% Taking Transit.",
    y = "MAE"
  ) +
  theme_plot()

# MAE against percent white population.
mae_yt_q4_demo <- ggplot(station_errors_demo_q4, aes(x = pct_white, y = MAE)) +
  geom_point(color = "#011f5b", alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, linetype = "dashed", color = "#990000") +
  labs(
    title = "Prediction Errors vs. Percent White",
    subtitle = "2024 Q4",
    x = "% White",
    y = "MAE"
  ) +
  theme_plot()

# Stack the three panels vertically (patchwork).
mae_inc_q4_map / mae_pub_q4_map / mae_yt_q4_demo

-   Relate errors to census characteristics
-   Are certain communities systematically harder to predict?
-   What are the equity implications?

PART III: MODEL IMPROVEMENT

Code
# Create a variable to represent the time passing into Q4.
# Create a variable to represent very cold temperatures.
start_of_q4 <- as.Date("2024-10-01")

# Temperature (°F) below which an hour is flagged as "too cold".
# (A previous comment said 40, but the threshold actually used is 37.)
cold_threshold <- 37

# Add the two new Q4 features to a data frame.
# One helper applied to both train and test keeps the feature logic
# identical across splits (it was previously duplicated by hand).
# - days_since_oct1: numeric days elapsed since the start of Q4.
# - is_too_cold: 1 when temp < cold_threshold, else 0.
add_q4_features <- function(df) {
  df %>%
    mutate(
      days_since_oct1 = as.numeric(date - start_of_q4),
      is_too_cold = ifelse(temp < cold_threshold, 1, 0)
    )
}

train_clean <- add_q4_features(train_clean)
test_clean <- add_q4_features(test_clean)
Code
# Play around with predictor columns.
# The interaction is written with `:` rather than `*` because the main
# effects days_since_oct1 and is_too_cold are already listed above;
# `*` would expand to main effects + interaction and R would just
# de-duplicate them, so this fits the identical model more clearly.
update_model <- lm(
  trips ~ as.factor(hour) + dow_simple + temp + precip +
    # New variables.
    days_since_oct1 + is_too_cold +
    lag_1hr + lag_3hr + lag_1day +
    is_too_cold:days_since_oct1,
  data = train_clean
  )

# Compare updated best model to best model on the held-out test set,
# using mean absolute error in trips per hour.
test_clean$pred_update <- predict(update_model, newdata = test_clean)
mae_update <- mean(abs(test_clean$trips - test_clean$pred_update), na.rm = TRUE)
mae_model2 <- mean(abs(test_clean$trips - test_clean$pred2), na.rm = TRUE)

cat("Model 2 MAE:", round(mae_model2, 4), "\n")
Model 2 MAE: 0.3934 
Code
# Report the updated model's test-set MAE.
cat("Updated Model 2 MAE:", round(mae_update, 4), "\n")
Updated Model 2 MAE: 0.3838 
Code
# Absolute MAE reduction (old minus new; positive = improvement).
cat("Improvement:", round(mae_model2 - mae_update, 4), "trips per hour\n")
Improvement: 0.0096 trips per hour
Code
# Relative MAE reduction, as a percent of the old model's MAE.
cat("Percent Improvement:", round(100 * (mae_model2 - mae_update) / mae_model2, 2), "%\n")
Percent Improvement: 2.44 %
Code
# Poisson regression.
# Trips are hourly counts, so a Poisson GLM with a log link is a
# natural alternative to the OLS model above (same predictors).
# NOTE(review): `is_too_cold * days_since_oct1` expands to both main
# effects plus their interaction; since the main effects are already
# listed above, R de-duplicates them and only the interaction term is
# effectively added here.
poisson_model <- glm(
  trips ~ as.factor(hour) + dow_simple + temp + precip +
    # New variables.
    days_since_oct1 + is_too_cold +
    lag_1hr + lag_3hr + lag_1day +
    (is_too_cold * days_since_oct1),
  data = train_clean,
  family = poisson(link = "log")
)

summary(poisson_model)

Call:
glm(formula = trips ~ as.factor(hour) + dow_simple + temp + precip + 
    days_since_oct1 + is_too_cold + lag_1hr + lag_3hr + lag_1day + 
    (is_too_cold * days_since_oct1), family = poisson(link = "log"), 
    data = train_clean)

Coefficients:
                              Estimate Std. Error z value             Pr(>|z|)
(Intercept)                 -0.9131170  0.0419595 -21.762 < 0.0000000000000002
as.factor(hour)1            -0.1369465  0.0221828  -6.174 0.000000000667716103
as.factor(hour)2            -0.2672919  0.0236286 -11.312 < 0.0000000000000002
as.factor(hour)3            -0.4709102  0.0257900 -18.259 < 0.0000000000000002
as.factor(hour)4            -0.7826789  0.0297718 -26.289 < 0.0000000000000002
as.factor(hour)5            -1.2230897  0.0354231 -34.528 < 0.0000000000000002
as.factor(hour)6            -1.6233709  0.0426170 -38.092 < 0.0000000000000002
as.factor(hour)7            -1.9311511  0.0495496 -38.974 < 0.0000000000000002
as.factor(hour)8            -2.2568754  0.0572957 -39.390 < 0.0000000000000002
as.factor(hour)9            -1.5739309  0.0416389 -37.800 < 0.0000000000000002
as.factor(hour)10           -0.6227419  0.0279573 -22.275 < 0.0000000000000002
as.factor(hour)11           -0.0324935  0.0227929  -1.426             0.153985
as.factor(hour)12            0.3117467  0.0205260  15.188 < 0.0000000000000002
as.factor(hour)13            0.2954192  0.0203918  14.487 < 0.0000000000000002
as.factor(hour)14            0.0805417  0.0213874   3.766             0.000166
as.factor(hour)15           -0.1035988  0.0220370  -4.701 0.000002587148054995
as.factor(hour)16            0.1311434  0.0209944   6.247 0.000000000419530800
as.factor(hour)17            0.1098964  0.0205198   5.356 0.000000085263552157
as.factor(hour)18            0.2473715  0.0201052  12.304 < 0.0000000000000002
as.factor(hour)19            0.3112235  0.0196725  15.820 < 0.0000000000000002
as.factor(hour)20            0.4199785  0.0189974  22.107 < 0.0000000000000002
as.factor(hour)21            0.5168676  0.0183879  28.109 < 0.0000000000000002
as.factor(hour)22            0.4078067  0.0185956  21.930 < 0.0000000000000002
as.factor(hour)23            0.1897249  0.0194666   9.746 < 0.0000000000000002
dow_simple2                  0.0245547  0.0119060   2.062             0.039173
dow_simple3                  0.0476361  0.0113348   4.203 0.000026383776902398
dow_simple4                 -0.0964824  0.0117776  -8.192 0.000000000000000257
dow_simple5                 -0.0802985  0.0131083  -6.126 0.000000000902451789
dow_simple6                 -0.0186588  0.0131457  -1.419             0.155788
dow_simple7                 -0.0629619  0.0123390  -5.103 0.000000334871976018
temp                         0.0075470  0.0005261  14.345 < 0.0000000000000002
precip                       2.0776681  0.6290327   3.303             0.000957
days_since_oct1             -0.0070479  0.0003491 -20.189 < 0.0000000000000002
is_too_cold                  1.4978745  0.1773336   8.447 < 0.0000000000000002
lag_1hr                      0.1068233  0.0015638  68.311 < 0.0000000000000002
lag_3hr                      0.0831559  0.0015248  54.534 < 0.0000000000000002
lag_1day                     0.1519347  0.0014836 102.409 < 0.0000000000000002
days_since_oct1:is_too_cold -0.0288684  0.0031356  -9.207 < 0.0000000000000002
                               
(Intercept)                 ***
as.factor(hour)1            ***
as.factor(hour)2            ***
as.factor(hour)3            ***
as.factor(hour)4            ***
as.factor(hour)5            ***
as.factor(hour)6            ***
as.factor(hour)7            ***
as.factor(hour)8            ***
as.factor(hour)9            ***
as.factor(hour)10           ***
as.factor(hour)11              
as.factor(hour)12           ***
as.factor(hour)13           ***
as.factor(hour)14           ***
as.factor(hour)15           ***
as.factor(hour)16           ***
as.factor(hour)17           ***
as.factor(hour)18           ***
as.factor(hour)19           ***
as.factor(hour)20           ***
as.factor(hour)21           ***
as.factor(hour)22           ***
as.factor(hour)23           ***
dow_simple2                 *  
dow_simple3                 ***
dow_simple4                 ***
dow_simple5                 ***
dow_simple6                    
dow_simple7                 ***
temp                        ***
precip                      ***
days_since_oct1             ***
is_too_cold                 ***
lag_1hr                     ***
lag_3hr                     ***
lag_1day                    ***
days_since_oct1:is_too_cold ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 277722  on 170374  degrees of freedom
Residual deviance: 191549  on 170337  degrees of freedom
AIC: 317832

Number of Fisher Scoring iterations: 6
Code
# Compare to OLS.
# type = "response" returns predictions on the count (trips) scale
# rather than the log-link scale, so the MAE is comparable to the OLS
# model's MAE.
test_clean$pred_poisson <- predict(poisson_model, newdata = test_clean, type = "response")
mae_poisson <- mean(abs(test_clean$trips - test_clean$pred_poisson), na.rm = TRUE)

cat("OLS Updated Model 2 MAE:", round(mae_update, 4), "\n")
OLS Updated Model 2 MAE: 0.3838 
Code
# Report the Poisson model's test-set MAE for side-by-side comparison.
cat("Poisson Model MAE:", round(mae_poisson, 4), "\n")
Poisson Model MAE: 0.3866 

Try a Poisson model for count data

  • Does this improve model fit?
Code
# Station improvement.
# Per-station MAE under the original model (pred2) and the updated
# model (pred_update); positive `improvement` means the new features
# reduced that station's error.
station_improvements <- test_clean %>%
  group_by(start_station_id, start_lat, start_lon) %>%
  summarize(
    mae_model2 = mean(abs(trips - pred2), na.rm = TRUE),
    mae_update = mean(abs(trips - pred_update), na.rm = TRUE),
    improvement = mae_model2 - mae_update,
    .groups = "drop"
  )

# Improvement map.
# Diverging color scale centered at zero (red = improved per the
# caption's sign convention, blue = got worse); point size encodes the
# magnitude of the change in either direction.
improvement_map <- ggplot() +
  geom_sf(data = philly_census, fill = "#4c6e52", color = "#f5f4f0", linewidth = 0.5) +
  geom_point(
    data = station_improvements,
    aes(x = start_lon, y = start_lat, color = improvement, size = abs(improvement)),
    alpha = 0.8
  ) +
  scale_color_gradient2(
    low = "#0889bc", mid = "#fef7d7", high = "#e83b2e",
    midpoint = 0,
    name = "MAE Change"
  )+
  # Fix legend order: size legend above the color bar.
  guides(
    size = guide_legend(order = 1),
    color = guide_colorbar(order = 2)
  ) +
  labs(
    title = "Model Improvement by Indego Station",
    subtitle = "Philadelphia, PA",
    caption = "Positive values indicate better predictions with new features.",
    size = "Absolute Improvement"
  ) +
  # theme_map() is defined elsewhere in this document.
  theme_map() +
  theme(
    legend.box = "vertical"
  )

improvement_map

Code
# Share of non-white population (complement of pct_white).
station_census_lookup$pct_non_yt <- (1 - station_census_lookup$pct_white)

# Census and avg_demand to station-level improvements.
# Stations without census demographics are dropped.
improvement_demo <- station_improvements %>%
  left_join(
    # Demographics.
    station_census_lookup %>% 
      select(start_station_id, med_inc, pct_public_commute, pct_non_yt), 
    by = "start_station_id"
  ) %>%
  left_join(
    # Join average demand data from error summary.
    station_errors_q4 %>% 
      select(start_station_id, avg_demand),
    by = "start_station_id"
  ) %>%
  filter(!is.na(med_inc))

# Improvement vs. % non-white population.
mae_poc_improvement_plot <- ggplot(improvement_demo, aes(x = pct_non_yt, y = improvement)) +
  geom_point(aes(size = avg_demand), alpha = 0.5, color = "#EF4269") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "#1A1851", linewidth = 1) +
  geom_smooth(method = "lm", se = TRUE, color = "#C5A453", linetype = "longdash", linewidth = 1) +
  scale_x_continuous(labels = scales::percent) +
  labs(
    title = "Model Improvement by Station",
    subtitle = "% Non-White Population",
    x = "% Non-White",
    y = "MAE Improvement (Trips per Hour)",
    size = "Average Demand"
  ) +
  theme_plot()

# Improvement vs. median income.
mae_inc_improvement_plot <- ggplot(improvement_demo, aes(x = med_inc, y = improvement)) +
  geom_point(aes(size = avg_demand), alpha = 0.5, color = "#EF4269") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "#1A1851", linewidth = 1) +
  geom_smooth(method = "lm", se = TRUE, color = "#C5A453", linetype = "longdash", linewidth = 1) +
  scale_x_continuous(labels = scales::dollar) +
  labs(
    title = "Model Improvement by Station",
    subtitle = "Median Income",
    x = "Median Income",
    y = "MAE Improvement (Trips per Hour)",
    size = "Average Demand"
  ) +
  theme_plot()

# Improvement vs. % public transit commuters.
# Fixed copy-paste bug: the x-axis label and caption previously said
# "Median Income" even though this plot maps pct_public_commute to x.
mae_pub_improvement_plot <- ggplot(improvement_demo, aes(x = pct_public_commute, y = improvement)) +
  geom_point(aes(size = avg_demand), alpha = 0.5, color = "#EF4269") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "#1A1851", linewidth = 1) +
  geom_smooth(method = "lm", se = TRUE, color = "#C5A453", linetype = "longdash", linewidth = 1) +
  scale_x_continuous(labels = scales::percent) +
  labs(
    title = "Model Improvement by Station",
    subtitle = "% Public Transit Commuters",
    caption = "Points above the dashed line show improvement (MAE decrease).\nChange in MAE (Old MAE - New MAE) vs. % Public Transit Commuters",
    x = "% Public Transit Commuters",
    y = "MAE Improvement (Trips per Hour)",
    size = "Average Demand"
  ) +
  theme_plot()

mae_poc_improvement_plot / mae_inc_improvement_plot / mae_pub_improvement_plot

Code
# Average signed error (actual - predicted) for each hour of the day,
# split by weekday vs. weekend, on the 2024 Q4 test set.
temporal_bias_q4 <- test_q4_error %>%
  group_by(hour, is_weekend) %>%
  summarize(mean_error = mean(error, na.rm = TRUE), .groups = "drop") %>%
  mutate(day_type = ifelse(is_weekend, "Weekend", "Weekday"))

# Plot the bias profile; the dashed reference line marks zero bias and
# is drawn first so the lines and points sit on top of it.
bias_plot <- temporal_bias_q4 %>%
  ggplot(aes(x = hour, y = mean_error, color = day_type)) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "#f48f33", linewidth = 1) +
  geom_line(linewidth = 1.5) +
  geom_point(size = 3) +
  scale_color_manual(values = c("Weekday" = "#08519c", "Weekend" = "#6baed6")) +
  labs(
    title = "Prediction Bias by Hour and Day Type",
    subtitle = "2024 Q4 Test Set",
    caption = "Positive values mean the model under-predicts (demand > prediction).\nMean Error (Actual Trips - Predicted Trips)",
    x = "Hour of Day",
    y = "Mean Error (Trips per Hour)",
    color = "Day Type"
  ) +
  theme_plot()

bias_plot

Code
# Signed residuals from the updated OLS model on the test set.
test_clean <- mutate(test_clean, residual = trips - pred_update)

# Residuals against temperature; the loess smooth traces average bias
# across the temperature range, the dashed line marks zero residual.
temp_residuals_plot <- test_clean %>%
  ggplot(aes(x = temp, y = residual)) +
  geom_point(alpha = 0.1, color = "#EF4269") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "#C5A453", linewidth = 1) +
  geom_smooth(method = "loess", color = "#778ac5", se = FALSE, linewidth = 1) +
  labs(
    title = "Model Residuals vs. Temperature",
    subtitle = "Updated OLS Model (2024 Q4 Test Set)",
    caption = "Blue curve is the average bias across temperature.",
    x = "Temperature (°F)",
    y = "Residual (Actual Trips - Predicted Trips)"
  ) +
  theme_plot()

temp_residuals_plot

Implementation:

  • Add your features to the best model
  • Compare MAE before and after
  • Explain why you chose these features
  • Did they improve predictions? Where?

PART IV: CRITICAL REFLECTION

1. Operational Implications

  1. Operational implications:
    • Is your final MAE “good enough” for Indego to use?
    • When do prediction errors cause problems for rebalancing?
    • Would you recommend deploying this system? Under what conditions?

2. Equity Considerations

  1. Equity considerations:
    • Do prediction errors disproportionately affect certain neighborhoods?
    • Could this system worsen existing disparities in bike access?
    • What safeguards would you recommend?

3. Model Limitations

  1. Model limitations:
    • What patterns is your model missing?
    • What assumptions might not hold in real deployment?
    • How would you improve this with more time/data?

Brief report summarizing (with supporting data & visualization):

-   Your quarter and why you chose it
-   Model comparison results
-   Error analysis insights
-   New features you added and why
-   Critical reflection on deployment